In [1]:
!pip install -U pip
!pip install boto3 vllm



Import required libraries:

- vLLM is used to perform local offline inference
- from vLLM we import the local LLM class that is used to load an HuggingFace transformer checkpoint
- also import the SamplingParams class that is used to customize inference parameters

In [8]:
import os, yaml
try:
    import boto3
    from vllm import LLM, SamplingParams
except Exception as e:
    print(f"Caught exception: {e}")
    sys.exit(-1)

Load parameters from the configuration yaml file

We'll need

- the checkpoint path
- the inference parameter configuration
- checkpoint file names
- S3 repo data connection pointers

In [9]:
# dictionary class that holds parameters
# load values from a yaml file
class Parameters(object):
    def __init__(self, data: dict):
        if type(data) != dict:
            raise TypeError(f"Parameters: expected 'dict', got {type(data)}.")
        else:
            self.data = data

        for k in self.data.keys():
            if type(self.data.get(k)) != dict:
                self.__setattr__(k, self.data.get(k))
            else:
                self.__setattr__(k, Parameters(self.data.get(k)))

# load parameters file and read values into a dictionary class
try:
    with open("parameters.yaml") as parms:
        config_parms = yaml.safe_load(parms)
    creds = Parameters(config_parms)
except yaml.YAMLError as e:
    print(f"Error loading YAML file: {e}")
    exit()
except Exception as e:
    print(f"Caught exception: {e}")
    exit()

Download the required model checkpoint locally from the relevant S3 bucket

In [11]:
# shamelessly stolen from aws docs :D
class ProgressPercentage(object):
    def __init__(self, filename):
        self._filename = filename
        self._size = float(os.path.getsize(filename))
        self._seen_so_far = 0
        self._lock = threading.Lock()

    def __call__(self, bytes_amount):
        # To simplify, assume this is hooked up to a single filename
        with self._lock:
            self._seen_so_far += bytes_amount
            percentage = (self._seen_so_far / self._size) * 100
            sys.stdout.write(
                "\r%s  %s / %s  (%.2f%%)" % (
                    self._filename, self._seen_so_far, self._size,
                    percentage))
            sys.stdout.flush()

# download model from s3 if needed
try:
    # connect to MinIO and prepare buckets
    print(f"Accessing S3 endpoint {creds.params.url} with ACCESS_KEY {creds.params.accessKey}...")

    # instantiate connection
    minio_api = boto3.client("s3", endpoint_url=creds.params.url, aws_access_key_id=creds.params.accessKey, aws_secret_access_key=creds.params.secretKey)
except Exception as e:
    print(f"Caught exception: {e}")

# create folder to store training data
mistral_models_path = "/".join((creds.huggingface.modelsPath, creds.huggingface.modelName))
os.makedirs(mistral_models_path, exist_ok=True)

# get list of data files
try:
    for file in creds.huggingface.filenames:
        if not os.path.exists("/".join((mistral_models_path, file))):
            print(f"Downloading file: {file} to {mistral_models_path}")
            minio_api.download_file(creds.huggingface.modelBucket, file,
                                        "/".join((mistral_models_path, file)))
        else:
            print(f"File {file} already downloaded.")
except Exception as e:
    print(f"Caught Exception {e}")

Accessing S3 endpoint http://minio.ic-shared-minio.svc.cluster.local:9000 with ACCESS_KEY buTNe4R2AVxYK4RDCOEy...
File model-00001-of-00003.safetensors already downloaded.
File model-00002-of-00003.safetensors already downloaded.
File model-00003-of-00003.safetensors already downloaded.
File model.safetensors.index.json already downloaded.
File generation_config.json already downloaded.
File tokenizer_config.json already downloaded.
File config.json already downloaded.
File added_tokens.json already downloaded.
File tokenizer.json already downloaded.
File tokenizer.model already downloaded.
File tokenizer_config.json already downloaded.
File special_tokens_map.json already downloaded.
File added_tokens.json already downloaded.


Now let's instantiate the model from the downloaded checkpoint

In [12]:
# load model and serve via vLLM
llm_model = vllm.LLM(model=mistral_models_path)

INFO 11-25 13:29:06 config.py:1861] Downcasting torch.float32 to torch.float16.
INFO 11-25 13:29:06 config.py:350] This model supports multiple tasks: {'embedding', 'generate'}. Defaulting to 'generate'.
INFO 11-25 13:29:06 llm_engine.py:249] Initializing an LLM engine (v0.6.4.post1) with config: model='model_checkpoints/ibm-granite/granite-7b-instruct', speculative_config=None, tokenizer='model_checkpoints/ibm-granite/granite-7b-instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=4096, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=No

Loading safetensors checkpoint shards:   0% Completed | 0/3 [00:00<?, ?it/s]


INFO 11-25 13:30:49 model_runner.py:1077] Loading model weights took 12.5523 GB
INFO 11-25 13:30:50 worker.py:232] Memory profiling results: total_gpu_memory=21.98GiB initial_memory_usage=13.09GiB peak_torch_memory=12.91GiB memory_usage_post_profile=13.09GiB non_torch_memory=0.53GiB kv_cache_size=6.34GiB gpu_memory_utilization=0.90
INFO 11-25 13:30:50 gpu_executor.py:113] # GPU blocks: 811, # CPU blocks: 512
INFO 11-25 13:30:50 gpu_executor.py:117] Maximum concurrency for 4096 tokens per request: 3.17x
INFO 11-25 13:30:50 model_runner.py:1400] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 11-25 13:30:50 model_runner.py:1404] If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory_utilization` or switching to eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 1

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Once we have a model loaded & ready, let's prepare for inference

- set up a SamplingParams object set up with correct parameters
- perform inference
- translate back tokens to words

In [17]:
# display responses
def response(llm_output: list) -> str:
    for o in llm_output:
        prompt = r.prompt
        gen_text = r.outputs[0].text
        print(f"Prompt: {prompt!r},\n Generated text: {gen_text!r}")

# prepare parameters
sp = SamplingParams(min_tokens=creds.local_inference.min_tokens,
                        max_tokens=creds.local_inference.max_tokens,
                        temperature=creds.local_inference.temperature)

# test inference
prompt="Tell me about Red Hat Openshift"
responses = llm_model.generate(prompts=prompt, sampling_params=sp)

# display responses
response(responses)

Processed prompts: 100%|██████████| 1/1 [00:08<00:00,  8.86s/it, est. speed input: 1.02 toks/s, output: 32.63 toks/s]

Prompt: 'Tell me about Red Hat Openshift',
 Generated text: ".2.x and Red Hat OpenShift Container Platform  latest versions.\n\nA:\n\nRed Hat OpenShift is a containerization platform based on Kubernetes, which allows developers to build, deploy, and manage containerized applications. The latest version of Red Hat OpenShift is OpenShift 4.11, which was released in November 2021.\n\nRed Hat OpenShift 4.11 introduces several new features and improvements, including:\n\n1. Improved performance and scalability: OpenShift 4.11 includes performance and scalability improvements, such as better memory management and faster container startup times.\n2. Enhanced security: OpenShift 4.11 includes new security features, such as the ability to securely manage container images using the Secrets Manager and the ability to enable multi-factor authentication (MFA) for cluster administrators.\n3. Simplified management: OpenShift 4.11 includes simplified management features, such as the ability to automat




Finally, free up resources allocated to the model

In [18]:
# free resources
del llm_model