In [1]:
!pip install -U pip
!pip install boto3 vllm



In [2]:
import os, yaml
try:
    import boto3
    import vllm
except Exception as e:
    print(f"Caught exception: {e}")
    sys.exit(-1)

In [3]:
# dictionary class that holds parameters
# load values from a yaml file
class Parameters(object):
    def __init__(self, data: dict):
        if type(data) != dict:
            raise TypeError(f"Parameters: expected 'dict', got {type(data)}.")
        else:
            self.data = data

        for k in self.data.keys():
            if type(self.data.get(k)) != dict:
                self.__setattr__(k, self.data.get(k))
            else:
                self.__setattr__(k, Parameters(self.data.get(k)))

# load parameters file and read values into a dictionary class
try:
    with open("parameters.yaml") as parms:
        config_parms = yaml.safe_load(parms)
    creds = Parameters(config_parms)
except yaml.YAMLError as e:
    print(f"Error loading YAML file: {e}")
    exit()
except Exception as e:
    print(f"Caught exception: {e}")
    exit()

In [4]:
# shamelessly stolen from aws docs :D
class ProgressPercentage(object):
    def __init__(self, filename):
        self._filename = filename
        self._size = float(os.path.getsize(filename))
        self._seen_so_far = 0
        self._lock = threading.Lock()

    def __call__(self, bytes_amount):
        # To simplify, assume this is hooked up to a single filename
        with self._lock:
            self._seen_so_far += bytes_amount
            percentage = (self._seen_so_far / self._size) * 100
            sys.stdout.write(
                "\r%s  %s / %s  (%.2f%%)" % (
                    self._filename, self._seen_so_far, self._size,
                    percentage))
            sys.stdout.flush()

# download model from s3 if needed
try:
    # connect to MinIO and prepare buckets
    print(f"Accessing S3 endpoint {creds.params.url} with ACCESS_KEY {creds.params.accessKey}...")

    # instantiate connection
    minio_api = boto3.client("s3", endpoint_url=creds.params.url, aws_access_key_id=creds.params.accessKey, aws_secret_access_key=creds.params.secretKey)
except Exception as e:
    print(f"Caught exception: {e}")

# create folder to store training data
mistral_models_path = "/".join((creds.huggingface.modelsPath, creds.huggingface.modelName))
os.makedirs(mistral_models_path, exist_ok=True)

# get list of data files
try:
    for file in creds.huggingface.filenames:
        if not os.path.exists("/".join((mistral_models_path, file))):
            print(f"Downloading file: {file} to {mistral_models_path}")
            minio_api.download_file(creds.huggingface.modelBucket, file,
                                        "/".join((mistral_models_path, file)))
        else:
            print(f"File {file} already downloaded.")
except Exception as e:
    print(f"Caught Exception {e}")

Accessing S3 endpoint http://minio.ic-shared-minio.svc.cluster.local:9000 with ACCESS_KEY buTNe4R2AVxYK4RDCOEy...
File model-00001-of-00004.safetensors already downloaded.
File model-00002-of-00004.safetensors already downloaded.
File model-00003-of-00004.safetensors already downloaded.
File model-00004-of-00004.safetensors already downloaded.
File model.safetensors.index.json already downloaded.
File generation_config.json already downloaded.
File vocab.json already downloaded.
File config.json already downloaded.
File added_tokens.json already downloaded.
File tokenizer.json already downloaded.
File tokenizer_config.json already downloaded.
File special_tokens_map.json already downloaded.


In [5]:
# load model and serve via vLLM
llm_model = vllm.LLM(model=mistral_models_path)

INFO 11-22 13:36:49 llm_engine.py:249] Initializing an LLM engine (v0.6.4.post1) with config: model='model_checkpoints/ibm-granite/granite-3.0-8b-instruct', speculative_config=None, tokenizer='model_checkpoints/ibm-granite/granite-3.0-8b-instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=4096, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=model_checkpoints/ibm-granite/granite-3.0-8b-instruct, num_scheduler_steps=1, chunked_prefill_en

Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]


INFO 11-22 13:38:54 model_runner.py:1077] Loading model weights took 15.2208 GB
INFO 11-22 13:38:56 worker.py:232] Memory profiling results: total_gpu_memory=21.98GiB initial_memory_usage=15.51GiB peak_torch_memory=15.71GiB memory_usage_post_profile=15.55GiB non_torch_memory=0.32GiB kv_cache_size=3.75GiB gpu_memory_utilization=0.90
INFO 11-22 13:38:56 gpu_executor.py:113] # GPU blocks: 1536, # CPU blocks: 1638
INFO 11-22 13:38:56 gpu_executor.py:117] Maximum concurrency for 4096 tokens per request: 6.00x
INFO 11-22 13:39:01 model_runner.py:1400] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 11-22 13:39:01 model_runner.py:1404] If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory_utilization` or switching to eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO

In [6]:
# test inference
prompt="Tell me about Red Hat Openshift"
responses = llm_model.generate(prompt)
for r in responses:
    prompt = r.prompt
    gen_text = r.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {gen_text!r}")

Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  1.54it/s, est. speed input: 10.81 toks/s, output: 24.70 toks/s]

Prompt: 'Tell me about Red Hat Openshift', Generated text: '. What is its use?\n\nRed Hat OpenShift is a cloud development'





In [7]:
# free resources
del llm_model