In [1]:
!pip install -U pip
!pip install transformers accelerate tqdm bitsandbytes
!pip install boto3



In [2]:
# import required modules
import os, yaml
try:
    import sys, threading
    import boto3
    from boto3.s3.transfer import TransferConfig
    from botocore.exceptions import ClientError
    import torch
    import torch.cuda as tc
    from transformers import AutoTokenizer, AutoModelForCausalLM
    from transformers import BitsAndBytesConfig
except Exception as e:
    # report the failure, then stop: continuing would only make the later
    # cells fail with a NameError that hides the real cause
    print(f"Caught Exception: {e}")
    raise

# upper bound on the number of newly generated tokens
# (passed as max_new_tokens to generate() in the inference test)
max_tokens = 40

# shamelessly stolen from aws docs :D
class ProgressPercentage(object):
    """Progress callback that prints a one-line, in-place transfer status.

    An instance is handed to the upload calls as ``Callback=`` and is invoked
    with the number of bytes moved since the previous invocation.

    Args:
        filename: path of the local file being transferred; its size on disk
            is taken as the 100% mark.

    Raises:
        OSError: from ``os.path.getsize`` if ``filename`` cannot be stat'ed.
    """

    def __init__(self, filename):
        self._filename = filename
        self._size = float(os.path.getsize(filename))
        self._seen_so_far = 0
        # guards the running total in case the callback is invoked concurrently
        self._lock = threading.Lock()

    def __call__(self, bytes_amount):
        """Add ``bytes_amount`` to the running total and redraw the status line."""
        # To simplify, assume this is hooked up to a single filename
        with self._lock:
            self._seen_so_far += bytes_amount
            if self._size > 0:
                percentage = (self._seen_so_far / self._size) * 100
            else:
                # zero-byte file: nothing left to send, and dividing by the
                # size would raise ZeroDivisionError
                percentage = 100.0
            # leading "\r" rewinds to the start of the line so the status
            # is overwritten in place instead of scrolling
            sys.stdout.write(
                "\r%s  %s / %s  (%.2f%%)" % (
                    self._filename, self._seen_so_far, self._size,
                    percentage))
            sys.stdout.flush()

In [3]:
# dictionary class that holds parameters
# load values from a yaml file
class Parameters(object):
    """Attribute-style view over a (possibly nested) dictionary.

    Every key of ``data`` becomes an attribute of the instance; values that
    are dictionaries themselves are wrapped in their own ``Parameters`` so
    nested settings can be read as ``obj.section.key``.

    Args:
        data: mapping to expose; kept unchanged on ``self.data``.

    Raises:
        TypeError: if ``data`` is not a dict.
    """

    def __init__(self, data: dict):
        if not isinstance(data, dict):
            raise TypeError(f"Parameters: expected 'dict', got {type(data)}.")
        self.data = data

        # walk the local mapping rather than self.data: a key that is itself
        # named "data" rebinds self.data, which must not break later lookups
        for key, value in data.items():
            if isinstance(value, dict):
                setattr(self, key, Parameters(value))
            else:
                setattr(self, key, value)

# load parameters file and read values into a dictionary class
# on failure the error is reported and re-raised: the cell stops with a
# traceback, whereas exit() inside a notebook asks the kernel to shut down
try:
    with open("parameters.yaml") as parms:
        config_parms = yaml.safe_load(parms)
    creds = Parameters(config_parms)
except yaml.YAMLError as e:
    print(f"Error loading YAML file: {e}")
    raise
except Exception as e:
    print(f"Caught exception: {e}")
    raise

In [4]:
# download model from s3 if needed
try:
    # connect to MinIO and prepare buckets
    # only the endpoint is reported: credentials must not end up in the
    # notebook's saved output
    print(f"Accessing S3 endpoint {creds.params.url}...")

    # instantiate connection
    minio_api = boto3.client(
        "s3",
        endpoint_url=creds.params.url,
        aws_access_key_id=creds.params.accessKey,
        aws_secret_access_key=creds.params.secretKey,
    )
except Exception as e:
    print(f"Caught exception: {e}")

# create folder to store the model files
mistral_models_path = "/".join((creds.huggingface.modelsPath, creds.huggingface.modelName))
os.makedirs(mistral_models_path, exist_ok=True)

# fetch every configured file that is not already present locally
try:
    for file in creds.huggingface.filenames:
        local_path = "/".join((mistral_models_path, file))
        if not os.path.exists(local_path):
            print(f"Downloading file: {file} to {mistral_models_path}")
            minio_api.download_file(creds.huggingface.modelBucket, file, local_path)
        else:
            print(f"File {file} already downloaded.")
except Exception as e:
    print(f"Caught Exception {e}")

Accessing S3 endpoint http://minio-svc.minio.svc.cluster.local:9000 with ACCESS_KEY [REDACTED]...
File model-00001-of-00003.safetensors already downloaded.
File model-00002-of-00003.safetensors already downloaded.
File model-00003-of-00003.safetensors already downloaded.
File model.safetensors.index.json already downloaded.
File config.json already downloaded.
File params.json already downloaded.
File tokenizer.model already downloaded.
File tokenizer.json already downloaded.
File tokenizer_config.json already downloaded.
File special_tokens_map.json already downloaded.


In [5]:
# make sure that an accelerator is attached to the pytorch runtime
accelerator = "cpu"
if tc.is_available():
    print(f"Torch reports: CUDA is available, {tc.device_count()} GPU")
    print(f"Accelerator model: {tc.get_device_name()}")
    accelerator = "cuda"

# query nvidia card via SMI
!nvidia-smi

# report detected accelerator
print(f"Using accelerator '{accelerator}'")

Torch reports: CUDA is available, 1 GPU
Accelerator model: Tesla V100-SXM2-16GB
Thu Jun 20 11:50:32 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla V100-SXM2-16GB           On  | 00000000:00:1E.0 Off |                    0 |
| N/A   32C    P0              25W / 300W |      2MiB / 16384MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+------------

In [6]:
# mistral is large and the card only offers 16GB of VRAM, which may become
# a problem when the model is converted to ONNX;
# to shrink it, the weights are quantized while loading:
# 4-bit weights, with float16 as the compute dtype
bnb_options = {
    "load_in_4bit": True,
    "bnb_4bit_compute_dtype": torch.float16,
}
quantization_config = BitsAndBytesConfig(**bnb_options)

In [7]:
# load tokenizer from the pretrained checkpoint directory
# torch_dtype / device_map / quantization_config / use_safetensors are
# model-loading options; a tokenizer has no use for them, so they are
# not passed here
mistral_tokenizer = AutoTokenizer.from_pretrained(mistral_models_path)

In [8]:
# load the causal LM from the local checkpoint, quantizing on the fly
model_load_options = dict(
    torch_dtype=torch.float16,
    device_map="auto",
    quantization_config=quantization_config,
    use_safetensors=True,
)
mistral_model = AutoModelForCausalLM.from_pretrained(mistral_models_path, **model_load_options)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [9]:
# model has been loaded
!nvidia-smi
print(f"Memory footprint of the current model (4-bit quantized): {mistral_model.get_memory_footprint()/(1024*1024*1024)}GB of VRAM")

Thu Jun 20 11:50:45 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla V100-SXM2-16GB           On  | 00000000:00:1E.0 Off |                    0 |
| N/A   33C    P0              41W / 300W |   5312MiB / 16384MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [10]:
# smoke-test the tokenizer: encode a short prompt and move it to the selected device
prompt = "Tell me about RedHat"
encoded_prompt = mistral_tokenizer(prompt, return_tensors="pt")
input_tokens = encoded_prompt.to(accelerator)
print(input_tokens)

{'input_ids': tensor([[    1, 16027,  1296,  1452,  4458, 29537,  1038]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1]], device='cuda:0')}


In [11]:
# smoke-test inference: generate at most max_tokens new tokens for the encoded prompt
output_tokens = mistral_model.generate(max_new_tokens=max_tokens, **input_tokens)
print(output_tokens)

# turn the first generated sequence back into text
first_sequence = output_tokens[0]
generated_text = mistral_tokenizer.decode(first_sequence, skip_special_tokens=True)
print(generated_text)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


tensor([[    1, 16027,  1296,  1452,  4458, 29537,  1038, 29491,   781,   781,
          8284, 29537,  1038,  1117,  1032, 20254,  6791, 29491,  1429,  1117,
          1032,  2701, 11281,  2355, 29491,  1429,  1117,  1032,  2701, 11281,
          2355, 29491,  1429,  1117,  1032,  2701, 11281,  2355, 29491,  1429,
          1117,  1032,  2701, 11281,  2355, 29491,  1429]], device='cuda:0')
Tell me about RedHat.

RedHat is a Linux distribution. It is a free operating system. It is a free operating system. It is a free operating system. It is a free operating system. It


In [12]:
# persist the quantized model on local disk
try:
    # the model answered the test prompt, so set up the output directory
    savepath_parts = [creds.huggingface.modelsPath, "quantized"]
    quantized_savepath = "/".join(savepath_parts)
    os.makedirs(quantized_savepath, exist_ok=True)

    # write the quantized weights and configuration into that directory
    mistral_model.save_pretrained(quantized_savepath)
except Exception as err:
    print(f"Caught exception: {err}")

In [13]:
# upload quantized model to s3
# files written by save_pretrained() into the quantized save path
quantized_model_files = [ "model.safetensors", "generation_config.json", "config.json" ]
# tokenizer files, taken unchanged from the downloaded checkpoint directory
tokenizer_model_files = [ "tokenizer.model", "tokenizer.json", "tokenizer_config.json", "special_tokens_map.json" ]

# connect to MinIO and prepare buckets
# only the endpoint is reported: credentials must not end up in the
# notebook's saved output
print(f"Accessing S3 endpoint {creds.params.url}...")

# instantiate connection
minio_api = boto3.client(
    "s3",
    endpoint_url=creds.params.url,
    aws_access_key_id=creds.params.accessKey,
    aws_secret_access_key=creds.params.secretKey,
)

# tells whether an object with exactly this key is present in a remote bucket
def check_exists(s3api, bucket, filename):
    """Return True when ``bucket`` holds an object whose key equals ``filename``.

    The bucket is listed with ``filename`` as prefix and the returned keys are
    compared for an exact match. Any error while reading the listing (for
    instance a response without "Contents") yields False; errors raised by the
    listing call itself propagate to the caller.
    """
    rsp = s3api.list_objects_v2(Bucket=bucket, Prefix=filename)
    try:
        listed_keys = tuple(entry.get("Key") for entry in rsp.get("Contents"))
        return filename in listed_keys
    except Exception:
        return False

# Set the desired multipart threshold value (5GB)
GB = 1024 ** 3
transfer_config = TransferConfig(multipart_threshold = 5*GB, use_threads=True)


def upload_missing_files(s3api, bucket, filenames, source_dir, config):
    """Upload each of ``filenames`` found in ``source_dir`` unless ``bucket`` already has it.

    Args:
        s3api: S3 client used for the existence check and the upload.
        bucket: name of the destination bucket.
        filenames: file names; each one is also used as the object key.
        source_dir: local directory the files are read from.
        config: transfer configuration handed to ``upload_file``.

    Errors raised by the client are not handled here and reach the caller.
    """
    for name in filenames:
        if check_exists(s3api, bucket, name):
            print(f"File {name} already exists in {bucket}")
            continue

        print(f"Uploading {name} to MinIO bucket {bucket}")
        filepath = "/".join((source_dir, name))
        s3api.upload_file(filepath, bucket, name,
                          Callback=ProgressPercentage(filepath),
                          Config=config)
        print("---")


try:
    quantized_bucket = creds.quantized_model.quantizedBucket

    # tokenizer files come from the downloaded checkpoint directory,
    # model files from the directory the quantized model was saved to
    upload_missing_files(minio_api, quantized_bucket, tokenizer_model_files,
                         mistral_models_path, transfer_config)
    upload_missing_files(minio_api, quantized_bucket, quantized_model_files,
                         quantized_savepath, transfer_config)
except ClientError as e:
    print(f"S3 Exception: {e.response['Error']['Code']}, trace: {e}")
except Exception as e:
    print(f"Caught exception: {e}")
else:
    # report success only when no upload failed
    print("Upload Complete.")

Accessing S3 endpoint http://minio-svc.minio.svc.cluster.local:9000 with ACCESS_KEY [REDACTED]...
File tokenizer.model already exists in quantized
File tokenizer.json already exists in quantized
File tokenizer_config.json already exists in quantized
File special_tokens_map.json already exists in quantized
File model.safetensors already exists in quantized
File generation_config.json already exists in quantized
File config.json already exists in quantized
Upload Complete.


In [14]:
#clear vram and unload live model from gpu
if mistral_model:
    del mistral_model
    tc.empty_cache()

# show CUDA status
!nvidia-smi

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Thu Jun 20 11:51:30 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla V100-SXM2-16GB           On  | 00000000:00:1E.0 Off |                    0 |
| N/A   34C    P0              41W / 300W |   5346MiB / 16384MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    