## 4 - Test remote inference with vLLM locally

Load the model and run inference through a locally running vLLM instance. No model server is needed in this case; everything runs inside Jupyter.

In [None]:
!pip install -U pip
!pip install boto3 vllm

Import required libraries:

- vLLM is used to perform local offline inference
- from vLLM we import the local LLM class that is used to load a Hugging Face transformer checkpoint
- also import the SamplingParams class that is used to customize inference parameters

In [None]:
import os
import sys
import threading

import yaml
try:
    import boto3
    import torch.cuda as tc
    from vllm import LLM, SamplingParams
except Exception as e:
    print(f"Caught exception: {e}")
    sys.exit(-1)

Load parameters from the configuration yaml file

We'll need

- the checkpoint path
- the inference parameter configuration
- checkpoint file names
- S3 repo data connection pointers

In [None]:
# dictionary class that holds parameters
# load values from a yaml file
class Parameters(object):
    """Attribute-style view over a (possibly nested) configuration dictionary.

    Every key of ``data`` becomes an attribute of the instance. Values that
    are themselves dictionaries are wrapped in a nested ``Parameters`` so they
    can be reached with chained attribute access (``cfg.section.option``).
    The dictionary passed in is kept as ``self.data``; note that a top-level
    key literally named ``"data"`` replaces that attribute with its own value.

    Args:
        data: dictionary to expose as attributes.

    Raises:
        TypeError: if ``data`` is not a dictionary.
    """

    def __init__(self, data: dict):
        if not isinstance(data, dict):
            raise TypeError(f"Parameters: expected 'dict', got {type(data)}.")
        self.data = data

        # Walk the local ``data`` rather than ``self.data``: a key named
        # "data" rebinds the attribute, and reading through it afterwards
        # would break the handling of every remaining key.
        for key, value in data.items():
            if isinstance(value, dict):
                setattr(self, key, Parameters(value))
            else:
                setattr(self, key, value)

# load parameters file and read values into a dictionary class
try:
    # explicit encoding so the result does not depend on the platform locale
    with open("parameters.yaml", encoding="utf-8") as parms:
        config_parms = yaml.safe_load(parms)
    creds = Parameters(config_parms)
except yaml.YAMLError as e:
    print(f"Error loading YAML file: {e}")
    # stop with a non-zero status: a bare exit() reports success and only
    # exists when the `site` module has injected it
    raise SystemExit(1)
except Exception as e:
    # covers a missing/unreadable file and a YAML document that is not a mapping
    print(f"Caught exception: {e}")
    raise SystemExit(1)

Download the required model checkpoint locally from the relevant S3 bucket

In [None]:
# shamelessly stolen from aws docs :D
class ProgressPercentage(object):
    """Callable that prints a running transfer percentage for one file.

    Each call adds ``bytes_amount`` to a running total and rewrites a single
    status line on stdout (the line starts with a carriage return).

    Args:
        filename: path of a local file; its size on disk is taken as 100%.

    Raises:
        OSError: from ``os.path.getsize`` when ``filename`` cannot be stat'ed.
    """

    def __init__(self, filename):
        self._filename = filename
        self._size = float(os.path.getsize(filename))
        self._seen_so_far = 0
        # serialises updates in case the callback is invoked from several threads
        self._lock = threading.Lock()

    def __call__(self, bytes_amount):
        # To simplify, assume this is hooked up to a single filename
        with self._lock:
            self._seen_so_far += bytes_amount
            # an empty file has nothing left to transfer: report 100% instead
            # of dividing by zero
            if self._size:
                percentage = (self._seen_so_far / self._size) * 100
            else:
                percentage = 100.0
            sys.stdout.write(
                "\r%s  %s / %s  (%.2f%%)" % (
                    self._filename, self._seen_so_far, self._size,
                    percentage))
            sys.stdout.flush()

# download model from s3 if needed
try:
    # connect to MinIO and prepare buckets
    print(f"Accessing S3 endpoint {creds.params.url} with ACCESS_KEY {creds.params.accessKey}...")

    # instantiate connection
    minio_api = boto3.client(
        "s3",
        endpoint_url=creds.params.url,
        aws_access_key_id=creds.params.accessKey,
        aws_secret_access_key=creds.params.secretKey,
    )
except Exception as e:
    print(f"Caught exception: {e}")
    # without a client `minio_api` stays undefined and nothing below can work,
    # so stop the cell instead of carrying on
    raise

# create folder to store the model checkpoint files
mistral_models_path = "/".join((creds.huggingface.modelsPath, creds.huggingface.modelName))
os.makedirs(mistral_models_path, exist_ok=True)

# fetch every checkpoint file that is not already present locally
try:
    for file in creds.huggingface.filenames:
        local_path = "/".join((mistral_models_path, file))
        if os.path.exists(local_path):
            print(f"File {file} already downloaded.")
            continue
        print(f"Downloading file: {file} to {mistral_models_path}")
        minio_api.download_file(creds.huggingface.modelBucket, file, local_path)
except Exception as e:
    print(f"Caught Exception {e}")
    # the model-loading cell needs the listed files; surface the failure here
    # rather than continuing with an incomplete checkpoint folder
    raise

Now let's instantiate the model from the downloaded checkpoint

In [None]:
# load model and serve via vLLM
# show the GPU status before loading (IPython shell escape)
!nvidia-smi
# clear GPU mem, start from a clean situation
# (`tc` is the `torch.cuda` module imported at the top of the notebook)
tc.empty_cache()

# load the model in vLLM
# the checkpoint is read from the local folder prepared by the download cell;
# the maximum model length comes from `local_inference.vllm_model_len` in the
# configuration file
llm_model = LLM(model=mistral_models_path, max_model_len=creds.local_inference.vllm_model_len)

Once we have a model loaded & ready, let's prepare for inference

- set up a SamplingParams object with the correct parameters
- perform inference
- translate tokens back to words

In [None]:
# display responses
def response(llm_output: list) -> None:
    """Print the prompt and the first generated completion of each result.

    Args:
        llm_output: items exposing a ``prompt`` attribute and an ``outputs``
            sequence whose first element has a ``text`` attribute.

    Returns:
        None. The text is written to stdout only.
    """
    for result in llm_output:
        # only the first candidate completion of each request is shown
        first_completion = result.outputs[0].text
        print(f"Prompt: {result.prompt!r},\n Generated text: {first_completion!r}")

# sampling settings are read from the `local_inference` section of the config
sp = SamplingParams(
    min_tokens=creds.local_inference.min_tokens,
    max_tokens=creds.local_inference.max_tokens,
    temperature=creds.local_inference.temperature,
)

# run a single test prompt through the locally loaded model
prompt = "Tell me about Red Hat Openshift"
responses = llm_model.generate(prompts=prompt, sampling_params=sp)

# print the prompt together with the generated text
response(responses)

Finally, free up resources allocated to the model

In [None]:
# free resources
if llm_model:
    del llm_model
tc.empty_cache()
!nvidia-smi