## 3 - Load Mistral Model and Transform the checkpoint

Mistral is a moderately large model (we are currently using the 7B variant), so it can be hard to load onto a low-end device (e.g. a GPU with less than 16GB of VRAM).

This notebook explains how to:
* Load a full checkpoint
* Define a quantization profile (in this case a 4-bit weight quantization)
* Quantize the model via HuggingFace transformers, using a GPU if one is available
* Upload the resulting quantized model to S3 storage for future use

In [None]:
!pip install -U pip
!pip install transformers accelerate tqdm bitsandbytes sentencepiece
!pip install boto3

In [None]:
# import required modules
import os, yaml
try:
    import sys, threading
    import boto3
    from boto3.s3.transfer import TransferConfig
    from botocore.exceptions import ClientError
    import torch
    import torch.cuda as tc
    from transformers import AutoTokenizer, AutoModelForCausalLM
    from transformers import BitsAndBytesConfig
except Exception as e:
    print(f"Caught Exception: {e}")

# max number of generated tokens
# (passed as max_new_tokens to generate() in the inference test cell)
max_tokens = 40

# progress callback adapted from the boto3 documentation
class ProgressPercentage(object):
    """Print running transfer progress for a single local file.

    An instance is meant to be passed as the ``Callback`` argument of a
    boto3 transfer; every call adds the reported byte count to a running
    total and rewrites one status line on stdout.

    Args:
        filename: path of the local file being transferred; its size on
            disk is read once, at construction time.

    Raises:
        OSError: if ``filename`` does not exist or cannot be stat'ed.
    """

    def __init__(self, filename):
        self._filename = filename
        self._size = float(os.path.getsize(filename))
        self._seen_so_far = 0
        # the lock keeps the running total consistent if calls overlap
        self._lock = threading.Lock()

    def __call__(self, bytes_amount):
        """Record ``bytes_amount`` more bytes and refresh the status line."""
        # To simplify, assume this is hooked up to a single filename
        with self._lock:
            self._seen_so_far += bytes_amount
            if self._size:
                percentage = (self._seen_so_far / self._size) * 100
            else:
                # a zero-byte file has nothing left to send; report it as
                # complete instead of dividing by zero
                percentage = 100.0
            sys.stdout.write(
                "\r%s  %s / %s  (%.2f%%)" % (
                    self._filename, self._seen_so_far, self._size,
                    percentage))
            sys.stdout.flush()

In [None]:
# dictionary class that holds parameters
# load values from a yaml file
class Parameters(object):
    """Expose a (possibly nested) dict through attribute access.

    Every key of ``data`` becomes an attribute of the instance; values that
    are themselves dicts are wrapped in a nested ``Parameters``, so
    ``Parameters({"a": {"b": 1}}).a.b == 1``. The mapping passed in is kept
    on ``self.data`` (a key literally named ``data`` replaces it).

    Args:
        data: mapping of parameter names to values.

    Raises:
        TypeError: if ``data`` is not a dict (or dict subclass).
    """

    def __init__(self, data: dict):
        # isinstance also admits dict subclasses such as OrderedDict
        if not isinstance(data, dict):
            raise TypeError(f"Parameters: expected 'dict', got {type(data)}.")
        self.data = data

        # iterate the local mapping rather than self.data, so a key called
        # "data" cannot swap the mapping out from under the loop
        for key, value in data.items():
            if isinstance(value, dict):
                setattr(self, key, Parameters(value))
            else:
                setattr(self, key, value)

# load parameters file and read values into a dictionary class
# On any failure the error is printed and execution stops with a non-zero
# status. SystemExit is raised directly because exit() is the interactive-shell
# helper installed by the site module and reports success (status 0).
try:
    # explicit encoding so the file is read the same way on every platform
    with open("parameters.yaml", encoding="utf-8") as parms:
        config_parms = yaml.safe_load(parms)
    creds = Parameters(config_parms)
except yaml.YAMLError as e:
    print(f"Error loading YAML file: {e}")
    raise SystemExit(1) from e
except Exception as e:
    # e.g. missing file, or a YAML document whose top level is not a mapping
    print(f"Caught exception: {e}")
    raise SystemExit(1) from e

Download the full model from a pre-loaded storage endpoint (MinIO in this case)

In [None]:
# download model from s3 if needed
try:
    # announce which endpoint / access key pair is about to be used
    print(f"Accessing S3 endpoint {creds.params.url} with ACCESS_KEY {creds.params.accessKey}...")

    # open the S3 (MinIO) client
    minio_api = boto3.client(
        "s3",
        endpoint_url=creds.params.url,
        aws_access_key_id=creds.params.accessKey,
        aws_secret_access_key=creds.params.secretKey,
    )
except Exception as e:
    print(f"Caught exception: {e}")

# local folder that holds the downloaded checkpoint files
mistral_models_path = "/".join((creds.huggingface.modelsPath, creds.huggingface.modelName))
os.makedirs(mistral_models_path, exist_ok=True)

# fetch every listed checkpoint file that is not on disk yet
try:
    for file in creds.huggingface.filenames:
        _local_copy = "/".join((mistral_models_path, file))
        if os.path.exists(_local_copy):
            print(f"File {file} already downloaded.")
            continue
        print(f"Downloading file: {file} to {mistral_models_path}")
        minio_api.download_file(creds.huggingface.modelBucket, file, _local_copy)
except Exception as e:
    print(f"Caught Exception {e}")

Detect available CUDA accelerators. Use the GPU if one is found; otherwise fall back to CPU operation.

In [None]:
# make sure that an accelerator is attached to the pytorch runtime
# default to CPU; switched to "cuda" below only when torch reports CUDA as available
accelerator = "cpu"
if tc.is_available():
    # report the device count and the device name as seen by torch
    print(f"Torch reports: CUDA is available, {tc.device_count()} GPU")
    print(f"Accelerator model: {tc.get_device_name()}")
    accelerator = "cuda"

# query nvidia card via SMI
# (IPython shell escape; it is executed regardless of the check above)
!nvidia-smi

# report detected accelerator
print(f"Using accelerator '{accelerator}'")

# Define a Quantization Profile

Define a quantization profile (4-bit precision instead of full precision) in order to reduce the model size.
This can easily be done with the HuggingFace transformers library together with the bitsandbytes library.

cf. https://huggingface.co/blog/4bit-transformers-bitsandbytes

In [None]:
# mistral is a huge model and only 16GB of VRAM is available, which may
# cause trouble when converting it to ONNX, so the weights are shrunk:
# load them in 4 bits and use float16 as the compute dtype
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
)

In [None]:
# load tokenizer model from pretrained checkpoint
# NOTE: torch_dtype / device_map / quantization_config / use_safetensors are
# options for loading model weights; they do not configure a tokenizer, so
# only the checkpoint path is passed here
mistral_tokenizer = AutoTokenizer.from_pretrained(mistral_models_path)

In [None]:
# load the causal LM from the local checkpoint, applying the
# quantization profile defined in quantization_config
mistral_model = AutoModelForCausalLM.from_pretrained(
    mistral_models_path,
    torch_dtype=torch.float16,
    device_map="auto",
    quantization_config=quantization_config,
    use_safetensors=True,
)

Now check the size of the resulting quantized model. Note that the memory footprint is much smaller than that of the full-precision checkpoint.

In [None]:
# model has been loaded
!nvidia-smi
print(f"Memory footprint of the current model (4-bit quantized): {mistral_model.get_memory_footprint()/(1024*1024*1024)}GB of VRAM")

Test inference with the resulting model to ensure that it does indeed work

In [None]:
# test tokenizer: encode a short prompt and move the tensors to the selected device
_test_prompt = "Tell me about RedHat"
input_tokens = mistral_tokenizer(_test_prompt, return_tensors="pt").to(accelerator)
print(input_tokens)

In [None]:
# test inference: generate at most max_tokens new tokens from the encoded prompt
output_tokens = mistral_model.generate(**input_tokens, max_new_tokens=max_tokens)
print(output_tokens)

# decode the first returned sequence back to text and show it
_generated_text = mistral_tokenizer.decode(output_tokens[0], skip_special_tokens=True)
print(_generated_text)

# Finally upload the model to S3

After quantization and test, the resulting 4-bit checkpoint is uploaded to S3 for storage

In [None]:
# write the quantized checkpoint to local disk
try:
    # target folder: "<modelsPath>/quantized", created on demand
    quantized_savepath = "/".join((creds.huggingface.modelsPath, "quantized"))
    os.makedirs(quantized_savepath, exist_ok=True)

    # serialize the in-memory (quantized) model into that folder
    mistral_model.save_pretrained(quantized_savepath)
except Exception as err:
    print(f"Caught exception: {err}")

In [None]:
# upload quantized model to s3
# object names to upload: model files and tokenizer files are listed separately
quantized_model_files = [
    "model.safetensors",
    "generation_config.json",
    "config.json",
]
tokenizer_model_files = [
    "tokenizer.model",
    "tokenizer.json",
    "tokenizer_config.json",
    "special_tokens_map.json",
]

# announce which endpoint / access key pair is about to be used
print(f"Accessing S3 endpoint {creds.params.url} with ACCESS_KEY {creds.params.accessKey}...")

# open the S3 (MinIO) client
minio_api = boto3.client(
    "s3",
    endpoint_url=creds.params.url,
    aws_access_key_id=creds.params.accessKey,
    aws_secret_access_key=creds.params.secretKey,
)
# checks whether a file exists in a remote bucket
def check_exists(s3api, bucket, filename):
    """Return True when ``bucket`` holds an object whose key is exactly ``filename``.

    Args:
        s3api: S3 client object providing ``list_objects_v2``.
        bucket: name of the bucket to look in.
        filename: object key to look for.

    Returns:
        True if one of the listed keys equals ``filename``, False otherwise
        (including when the listing has no "Contents" entry).

    Errors raised by the listing call itself are not caught here.
    """
    rsp = s3api.list_objects_v2(Bucket=bucket, Prefix=filename)
    # "Contents" is missing when nothing matches the prefix; treat it as empty.
    # The prefix listing may also return longer keys, hence the exact comparison.
    listed = rsp.get("Contents") or ()
    return any(obj.get("Key") == filename for obj in listed)

# Set the desired multipart threshold value (5GB)
GB = 1024 ** 3
transfer_config = TransferConfig(multipart_threshold=5 * GB, use_threads=False)


def _upload_missing(source_dir, filenames):
    """Upload each of ``filenames`` from ``source_dir`` unless the bucket already has that key."""
    bucket = creds.quantized_model.quantizedBucket
    for k in filenames:
        if check_exists(minio_api, bucket, k):
            print(f"File {k} already exists in {bucket}")
            continue
        print(f"Uploading {k} to MinIO bucket {bucket}")
        filepath = "/".join((source_dir, k))
        minio_api.upload_file(filepath, bucket, k,
                              Callback=ProgressPercentage(filepath),
                              Config=transfer_config)
        # the progress callback rewrites one line in place; end it visibly
        print("---")


try:
    # tokenizer files are taken from the original checkpoint folder,
    # model files from the folder the quantized model was saved to
    _upload_missing(mistral_models_path, tokenizer_model_files)
    _upload_missing(quantized_savepath, quantized_model_files)
except ClientError as e:
    print(f"S3 Exception: {e.response['Error']['Code']}, trace: {e}")
except Exception as e:
    print(f"Caught exception: {e}")
else:
    # only report success when no upload raised
    print("Upload Complete.")

In [None]:
# clear vram and unload live model from gpu
# The name is looked up instead of evaluated, so running this cell when the
# model was already deleted (or never loaded) is a no-op, not a NameError.
if "mistral_model" in globals():
    del mistral_model
    # ask torch to release the cached GPU memory it is no longer using
    tc.empty_cache()

# show CUDA status
!nvidia-smi