In [1]:
import os

# vLLM needs its worker processes started with the "spawn" method in order
# to use multiple GPUs from inside a notebook. Outside of a notebook this
# setting can be ignored.
os.environ.update({"VLLM_WORKER_MULTIPROC_METHOD": "spawn"})

In [None]:
import torch

from oumi.core.configs import InferenceConfig
from oumi.core.types.turn import Conversation, Message, Role
from oumi.inference import VLLMInferenceEngine

### Setting up the environment

Make sure you have the `oumi` package installed: `pip install ".[train,dev]"`

Next, make sure to install the GPU dependencies: `pip install ".[gpu]"`

Finally, if using gated models such as `llama3`, make sure to set the `HF_TOKEN` environment variable to your Hugging Face token:

```python
os.environ["HF_TOKEN"] = "<my_token>"
```


In [None]:
# With more than one GPU available, Ray is used to parallelize the inference.
# This is essential when running a model that is too big to fit on a single GPU.

import ray

multiple_gpus_available = torch.cuda.is_available() and torch.cuda.device_count() > 1
if multiple_gpus_available:
    # Shut Ray down before (re-)initializing it with every visible GPU.
    ray.shutdown()
    ray.init(num_gpus=torch.cuda.device_count())

### Setting up the config file

Note: in this section we are writing the config file to the current working directory.

An alternative option is to initialize the params classes directly: `ModelParams`, `GenerationParams`.

In [4]:
# Path of the YAML inference config, relative to the current working directory.
config_path: str = "llama70b_inference_config.yaml"

In [None]:
%%writefile llama70b_inference_config.yaml

# Inference config written to the current working directory by this cell.
# It has two sections: `model` (what to load) and `generation` (how to generate).
# NOTE(review): the file name says "llama70b" but the active model below is the 8B variant.
model:
  # Keep exactly one `model_name` active; the alternatives are left commented out.
  model_name: "meta-llama/Meta-Llama-3.1-8B-Instruct"  # 8B model, requires 1x A100-40GB GPU
#   model_name: "meta-llama/Meta-Llama-3.1-70B-Instruct"  # 70B model, requires 4x A100-40GB GPUs
#   model_name: "bartowski/Meta-Llama-3.1-70B-Instruct-GGUF"  # 4-bit quantized model, requires 1x A100-40GB GPU. See the bonus section for more details.
  model_max_length: 512
  torch_dtype_str: "bfloat16"
  trust_remote_code: True
  attn_implementation: "flash_attention_2"

generation:
  max_new_tokens: 128
  batch_size: 1

### Load the model and the inference engine

In [None]:
%%time

# Downdload, and load the model in memory
# This may take a while, depending on your internet speed.
# The inference engine only needs to be loaded once and can be
# reused for multiple conversations.

config = InferenceConfig.from_yaml(config_path)

inference_engine = VLLMInferenceEngine(
    config.model,
    tensor_parallel_size=torch.cuda.device_count(),  # use all available GPUs
    # enable prefix caching for vLLM.
    # This is key for performance when running prompts with a long prefix,
    # such as judging or conversations with large system prompts
    # or few-shot examples.
    enable_prefix_caching=True,
)

### Preprocessing our inputs

The inference engine expects a list of conversations, where each conversation is a list of messages.

See the [Conversation](https://github.com/oumi-ai/oumi/blob/38b3d2b27407be5fc9be5a1dd88f9ad518f3491c/src/oumi/core/types/turn.py#L109) class for more details.

Tip: you can visualize how the conversation is rendered as a prompt with the following:

```python
inference_engine.apply_chat_template(conversation, tokenize=False)
```

In [None]:
# A single two-message conversation: a system instruction followed by the
# user text it applies to.
system_message = Message(
    role=Role.SYSTEM, content="Translate the following text into French."
)
user_message = Message(role=Role.USER, content="Hello, how are you?")

conversations = [Conversation(messages=[system_message, user_message])]

### Running inference

Under the hood, the vLLM engine will batch the conversations to run inference with high throughput.

Make sure to feed all your prompts to the engine at once for maximum throughput.

In [None]:
%%time

print(f"Running inference for {len(conversations)} conversations")

generations = inference_engine.infer(
    input=conversations,
    generation_config=config.generation,
)

For convenience, we also have the following function available:

```python
inference_engine.infer_from_file(input_filepath="path/to/file.json", generation_config=config.generation)
```

### Bonus: Running quantized GGUF models

You can also run quantized GGUF models by downloading the model file and passing it to the engine.

For example, to run the 70B Meta Llama 3.1 model quantized at 4-bit, you can do the following: 

First, we download the GGUF model file. There are multiple quantization schemes available; here we choose the `Q4_K_S` scheme, which is 4-bit with the `K_S` quantization algorithm.

In [None]:
from huggingface_hub import hf_hub_download

repo_id = "bullerwins/Meta-Llama-3.1-70B-Instruct-GGUF"
filename = "Meta-Llama-3.1-70B-Instruct-Q4_K_S.gguf"

# `local_dir="."` places the model file in the current working directory
# instead of HF_CACHE_DIR.
model_path = hf_hub_download(
    repo_id=repo_id,
    filename=filename,
    local_dir=".",
)

We then update the config file to point to the model we just downloaded:

In [None]:
%%writefile llama70b_inference_config.yaml

# GGUF variant of the inference config. This cell writes to the same file name
# as the earlier config cell, so it replaces that file's contents.
# NOTE(review): the config has to be loaded again (and the engine re-created)
# for these settings to take effect.
model:
  # Filepath to the GGUF model, which we just downloaded, see `model_path` output above
  model_name: "Meta-Llama-3.1-70B-Instruct-Q4_K_S.gguf"  
  # GGUF files do not have a config. We need to specify the tokenizer name manually.
  tokenizer_name: "meta-llama/Meta-Llama-3.1-70B-Instruct"  
  model_max_length: 512
  torch_dtype_str: "float16"  # GGUF models require float16
  trust_remote_code: True
  attn_implementation: "flash_attention_2"

generation:
  max_new_tokens: 128
  batch_size: 1