In [None]:
# Make the notebook behave as if commands were run from the project root directory
import os
import sys
from pathlib import Path

# The project root is the parent of the current working directory (absolute, resolved path)
root_dir = Path(os.getcwd()).parent.resolve()

# `sys.path` entries must be plain strings: the import system ignores any other type,
# so appending the `Path` object itself would silently have no effect.
root_dir_str = str(root_dir)
if root_dir_str not in sys.path:  # avoid piling up duplicate entries on re-runs
    sys.path.append(root_dir_str)

# Change the working directory to the project root
# NOTE: run this cell only once per kernel session; each extra run moves one more level up.
os.chdir(root_dir)
print("Working directory set to:", os.getcwd())

In [7]:
# Enable the autoreload extension so that edits to imported modules are picked up
# without restarting the kernel (mode 2: reload all modules before executing code)
%load_ext autoreload
%autoreload 2

## 1) Overview

The idea of the `llms` package is to make it very simple to send inference requests to LLMs. 
In summary: it allows inference calls to any of the supported providers and engines with something as simple as: `messages=list[images, text, video, function_call, ...]` and `generation_configs = {k:v, k:v,..}`. 


And behind the scenes, the package handles:
- Formatting of prompts for providers and models.
- Setting up clients, inference engines for HuggingFace
- Routing the inference call to the appropriate provider and engine
- Load balancing API keys
- Retry with exponential backoff with customized error logic
- Logging of inputs and outputs, including: HTML visualization of the prompt for debugging; token counts; conversation logs
- Return of multimodal outputs in a unified format independent of the provider, model, or provider mode.
- Type validation and handling of multiple media types
- Etc, etc

## 2) Setting API keys

Two alternatives:

#### 1) Set the API keys in the environment variables.
```bash
export OPENAI_API_KEY="<key>"
export GOOGLE_API_KEY="<key>"
export HF_TOKEN="<key>"
```
In this option, **no load balancing** of the keys is performed in case of multiple calls.

#### 2) (Recommended) Create an `api_keys.json` file as follows:

api_keys.json
```json
{
    "google": ["key1", "key2", "..."],
    "openai": ["key1", "key2", "..."],
    "huggingface": ["key1", "key2", "..."]
}
```

With this option, a load balancing of the keys will be performed and in case of quota limit errors, other keys will be tried.

**NOTE**: Also keep an `api_keys_repo.json` as a backup. 

Why: In case of concurrent processes, they will fetch keys from `api_keys.json` and remove keys for load balancing purposes. The code has signal handlers to return them to the files, but something can go wrong and prevent that.

## 3) Call LLM

`call_llm` is the main function of interest. It receives: "messages" to send to the model and a "generation config" to specify: (i) the model's behavior (ii) the inference providers/engines.

Below is a series of examples on how to use it.

In [None]:
from llms.llm_utils import call_llm, get_gen_config_fields, visualize_prompt
from PIL import Image

#### Call to OpenAI, Google Model

Below is one example of how the inputs can be provided for inference call. 

There are many variations possible. This file will include more in the future

In [13]:
# Example prompt showing several accepted input formats.
# File paths are relative to the project root (the working directory set in the first cell),
# consistent with the `llms/examples/...` paths used in the rest of this notebook.
inputs=[
        {"role": "system", "text": "You are an intelligent and helpful assistant."}, # dict with a role and text
        {"role": "user", "name": "example_user", "inputs": ["Example input 1","llms/examples/cat.png"]}, # a dict with user role, name and inputs
        {"role": "assistant", "name": "example_assistant", "contents": ["Example input 2","llms/examples/dog.png"]}, # a dict with assistant role, name and inputs
        "Describe **all** the below items.", # raw string
        ["Item (1):", "llms/examples/cat.png"], # A list with a prefix text and a file to an image (both are sent in the same message)
        ["Item (2):", Image.open("llms/examples/dog.png")], # A list with a prefix text and a PIL image (both are sent in the same message)
        ["Item (3):", "Once upon a time, there was a princess who lived in a castle."], # A list with only a text input
        "Provide your response as follows: <Title for Item 1> <Description for Item 1> <Title for Item 2> <Description for Item 2> <Title for Item 3> <Description for Item 3>"
  ]

**NOTEs:** 
 - Each of the entries in `inputs` is a `Message`. If that sounds confusing / ambiguous, check the file `llms.types.py` or continue reading.
 - In short: LLMs receive a series of `Message` objects, where a `Message` contains multiple raw inputs (such as images, text and video). The full prompt is a list of those messages.
 - We did not define a role for some entries (e.g.: `Item(1): ...`). These will default to `user` role. This behavior can be changed by providing the input as in the dictionary example or by creating a list of `Message` objects directly (see section (4)).

You can visualize what the prompt will look like by using the `visualize_prompt` function.

In [None]:
# This saves an .html file with the prompt as it will be sent to the model.
# The path is relative to the project root and matches the file read by the snippet below.
visualize_prompt(messages=inputs, output_path="llms/examples/vis.html")

# # To visualize in the jupyter notebook, run:
# from IPython.display import display, HTML

# # Read the HTML content from the file
# with open("llms/examples/vis.html", "r") as file:
#     html_content = file.read()

# # Display the content inline in the notebook
# display(HTML(html_content))

We now define a minimum set of generation configs and call Gemini:

In [None]:

# Minimal generation config for the Gemini call
gen_args = {
    "model": "gemini-2.0-flash-001",
    "temperature": 0.5,
    "max_tokens": 1000,
    "top_p": 0.95,
    "top_k": 40,
    "num_generations": 1,
}

# If these are provided, HTML log of the conversation and CSV logs of the usage will be saved in the given directories.
# Paths are relative to the project root, matching the `llms/examples/...` locations used elsewhere in this notebook.
conversation_dir = "llms/examples/conversation" # HTML and txt logs of the conversation saved here
usage_dir = "llms/examples/usage" # CSV logs of the usage saved here
call_id = "test_call" # If provided, the logs will be named like "llms/examples/conversation/test_call.html", "llms/examples/usage/test_call.csv", etc

response, model_generations = call_llm(gen_args, inputs, conversation_dir=conversation_dir, usage_dir=usage_dir, call_id=call_id)

#NOTE: we did not specify a `role` for some of the inputs.
# In this case, the `role` on any message not specified will be set to `user`.

# You can specify the role in the dictionary way as above or by creating list `Message` objects as explained in (4).

After this call, we have
- A list of `response` objects; these are dictionaries with data about the API request
- A list of `model_generations`; these are `Message` objects containing the model's raw outputs (text, images, etc).
- `html` and `txt` logs of the conversation round in the `llms/examples/conversation` directory
- `csv` files with token usage information in the `llms/examples/usage` directory

**NOTE1**: For more of the `gen_args` available:
- Please check `llms.generation_config.py` for more details on each one. Examples here will illustrate the main ones
- The command below lists all possible parameters to control model behavior setting up providers/engines, but output is not pretty and some parameters are provider/engine specific. 

In [None]:
# List all generation-config parameters that can be set in `gen_args` (raw, unformatted output)
get_gen_config_fields()

**NOTE 2:** Output format and accessing raw inputs

Print below to see how the output is returned.

The `Message` object is a unified format for both **inputs** and **outputs** of a user-model conversation. More details of it in `llms.types`, but in summary:
- A single message contains: (i) a `role` that identifies the entity sending the information; (ii) data ('text', 'images', etc) sent by the entity
- A conversation is a list of `Message` items.

In [None]:
# Inspect the raw objects returned by `call_llm`: first the generations, then the response data
for returned_obj in (model_generations, response):
    print(returned_obj)

Below some methods to access the `Message` raw data:

In [None]:
# Get all text content within a message (here: the first element of `model_generations`)
print(model_generations[0].text())

In [None]:
# Get all images within a message
# (there are none in this case because this model outputs only text)
model_generations[0].images()

In [None]:
# Get the message content as a list with interleaved text, image, video, etc.
model_generations[0].raw_data()

In [None]:
# Get the message as a dict with a format similar to the OpenAI chat completion format
model_generations[0].to_dict()

##### Continuing the conversation + calling a new model 

Now suppose we want to send another query with the previous inputs + the model response + a new request.

To make things more interesting, let's send this to **GPT4o** now.

Below we construct this new input using the previous list of `inputs` and showing some new ways of providing inputs

In [15]:
# Build the follow-up prompt: the previous inputs, then the model's answer, then a new user request
new_inputs = [
    *inputs,
    # A Message object can be passed directly as an input; it already carries the ROLE of its entity.
    model_generations[0],
    {"role": "user", "text": "Please give an opinion of the above conversation. How do you evaluate the assistant's performance?"},
]

You can visualize the prompt before sending for a sanity check by using the `visualize_prompt` tool.

In [16]:
# Save an `.html` file with the messages as they will be received by the model.
# Open it in a browser to sanity check that the order of messages, roles, entity names, etc. is correct.
output_path = "llms/examples/vis.html"
visualize_prompt(messages=new_inputs, output_path=output_path)

After making sure the prompt is correct, we can send it to GPT4o. 
- For that, we only need to change the `model` parameter in the previous generation arguments.
- There is no need to adjust parameter names or values to abide to the new provider. 
- Same thing for the prompt formats


In [None]:
# Only the model name changes in the generation config; we also ask for 2 generations.
gen_args.update({"model": "gpt-4o-2024-08-06", "num_generations": 2})

# A `call_id` saves the conversation and usage logs under a specific name:
# here `gpt4o_call.html` / `gpt4o_call.txt` inside `conversation_dir` and `gpt4o_call.csv` inside `usage_dir`.
response, model_generations = call_llm(
    gen_args,
    new_inputs,
    conversation_dir=conversation_dir,
    usage_dir=usage_dir,
    call_id="gpt4o_call",
)

In [None]:
# Print the text of each answer returned by the model
for generation in model_generations:
    print(generation.text())

**TODO**: add more examples:
- OpenAI's `response` API
- Batch generation
- Other variations for input formats and dictionary keys
- Multimodal outputs

### Call HuggingFace

The process for calling models from HuggingFace is the same, except that:

- (i) We need to specify some more arguments such as "engine" to deploy the model
    - Supported engines are: `automodel`, `server`, `vllm` and `openai`. Details below.
- (ii) There is a higher likelihood of bugs; many models in HuggingFace have model-specific quirks and it is impossible to foresee all of them.
    - The code will do the best effort to process the inputs and generate the outputs. But for instance, `Qwen-2.5-VL` was not supported by the `Automodel` class so there is a specific handling of model loading and generation that is hard to automate. 
    - Moreover, some models have specific prompts that are not always covered by the `apply_chat_template`. 
    - etc
- We can also specify other args like: which resources to use (e.g.: CPU, GPU, etc); whether to quantize or not; etc. See `llms.generation_config.py` for all HF-specific args.

Below examples make an inference call to `Qwen-2.5-VL-3B` using the three engines. 

Below is the same `inputs` as above, but with other examples of ways to send each input.

In [3]:
# Same prompt as before, written with other accepted input formats.
# Every entry is a separate message, so each one must be followed by a comma: two adjacent
# string literals without a comma are silently merged into a single string by Python.
inputs=[
        {"role": "system", "text": "You are an intelligent and helpful assistant."}, 
        "Describe **all** the below items.",
        {"role": "user", "text": "Item (1):", "image": "llms/examples/cat.png"}, # Another way to send an input
        {"role": "user", "contents":[{"type": "text", "text": "Item (2):"}, {"type": "image", "image": "llms/examples/dog.png"}]}, #OpenAI chat completion format
        "Item (3): Once upon a time, there was a princess who lived in a castle.",
        "Provide your response as follows: <Title for Item 1> <Description for Item 1> <Title for Item 2> <Description for Item 2> <Title for Item 3> <Description for Item 3>",
  ]


In [None]:
# Run this cell to visualize the prompt

# Single source of truth for the output file, so the file written and the file read are the same
vis_hf_path = "llms/examples/vis_hf.html"
visualize_prompt(inputs, vis_hf_path)

# Read the HTML content back from the file that was just written
with open(vis_hf_path, "r") as file:
    html_content = file.read()

# Display the content inline in the notebook
# (`HTML` is not available by default, so it is imported explicitly here)
from IPython.display import HTML, display

display(HTML(html_content))

#### Hugging Face - Automodel Engine


This mode is the same as the vanilla usage of hugging face; the model is available only to the current process.

In [None]:
# Generation config for running the model in-process through the `automodel` engine
gen_args = dict(
    model="Qwen/Qwen2.5-VL-3B-Instruct",
    engine="automodel",
    num_generations=1,
    temperature=0.5,
    max_tokens=1000,
    top_p=0.95,
    top_k=40,
    repetition_penalty=1.05,
    # Optional settings (uncomment to use):
    # flash_attn=True,     # Code will automatically try to use if available
    # torch_dtype="auto",  # Code will automatically choose based on model info
    # device="auto",       # Code will determine based on machine and other params. Typically sets to 'auto'
    # quant_bits="int8",   # Quantizes to int8 if supported by model. "int4" also supported.
)

# Directories passed to `call_llm` for the conversation logs and the usage logs
conversation_dir = "llms/examples/conversation"
usage_dir = "llms/examples/usage"

responses, model_generations = call_llm(
    gen_args,
    inputs,
    conversation_dir=conversation_dir,
    usage_dir=usage_dir,
)

NOTES:
- By default: 
    - `device_map=auto`. Set `device:<device>` to override. #TODO: allow dict with `device_map`;
    - Use `flash_attn` if it is available. To disable, set `flash_attn:False`
    - Set `dtype` based on the model information; if not found, it is set to `auto`. Set `dtype` to override.
- Behind the scenes, the prompts are converted to an OpenAI chat completions format that HF uses. Check them via `responses[idx]["prompt"]`

In [None]:
# Text content of the first message in `model_generations`
model_generations[0].text()

#### Hugging Face - Local Server Engine

The `server` engine makes model available at an `endpoint`, so multiple processes can send inference requests without using multiple GPUs.

There are two ways to deploy in this mode:

**Option 1: (Recommended) Host the model first, then send inference calls with `call_llm`**


1. Run:

 ```bash
 python -m llms.providers.hugging_face.host_model_hf "Qwen/Qwen2.5-VL-3B-Instruct" --host <host> --port <port>
 ```

2. Add `engine:server` and `endpoint:<host>:<port>` in `gen_args`

**NOTE**: If hosting in `machineA` and accessing model via `machineB`: execute step 1 in machineA; to `call_llm` from `machineB`, set `host` to the IP of machineA.


**Option 2:  Directly call `call_llm` with `engine:server` and `localhost:<port>` in `gen_args`.**
- This will automatically host the model if possible, using the same script `llms.providers.hugging_face.host_model_hf`
- It is less recommended as:
    - The process hosting the model will die if the first process that calls `call_llm` ends
    - For new models, weights will be downloaded; the code waits for the server to start, but it can take a while and you may get false positives saying the server was unable to start.
    - All kinds of problems if there are concurrent processes that need to wait for the same server to start
- Use this mostly to prototype using single process. Do not use for concurrent execution.


`call_llm` example:

In [None]:
# Assume the model is already being hosted with:
# python -m llms.providers.hugging_face.host_model_hf "Qwen/Qwen2.5-VL-3B-Instruct" --host localhost --port 8000

# Inference is sent to that server by setting `engine` and `endpoint` in `gen_args`:
gen_args = dict(
    model="Qwen/Qwen2.5-VL-3B-Instruct",
    num_generations=1,
    temperature=0.5,
    max_tokens=1000,
    top_p=0.95,
    top_k=40,
    repetition_penalty=1.05,
    engine="server",            # <--------- CHANGED `automodel` to `server`
    endpoint="localhost:8000",  # <--------- ADDED
)

# The inputs are passed unchanged.
response, model_generations = call_llm(
    gen_args,
    inputs,
    conversation_dir=conversation_dir,
    usage_dir=usage_dir,
)

#### Hugging Face - VLLM Engine

The `vllm` engine makes the model available to receive requests at `endpoint`, so multiple processes can send inference requests without using multiple GPUs.

NOTES:
- The idea is the same as `server`, but in this case the server is handled by `vllm`
- `vllm` has non-trivial optimization to handle concurrent calls. May be a better option in cases of high demand for the server.
- Issue: `vllm` tends to consume a lot of GPU memory to realize its optimizations. 
    - You may run out of memory even for models that are typically possible to load with vanilla automodel.
    - In these cases, try to increase `--gpu-mem` (between 0 and 1), do not pass `--enforce-eager` (set to false), and reduce `--max-model-len`.


There are two ways to deploy in this mode:



**Option 1: Host the model first, then send inference calls with `call_llm`**

1. Run 

```bash
python -m llms.providers.hugging_face.host_model_vllm <model_id> --host <host> --port <port> --num-gpus <num_gpus> --max-model-len <max_model_len>
# (check all params using -h)
```

2. Add `engine:vllm` and `<host>:<port>` in `gen_args`

**NOTE**: If hosting in `machineA` and accessing model via `machineB`: execute step 1 in machineA; to `call_llm` from `machineB`, set `host` to the IP of machineA.

**Option 2: Directly call `call_llm` with `engine:vllm` and `<host>:<port>` in `gen_args`.**
- All the warnings from the `server` case apply here too.

`call_llm` example:

In [None]:
# Obs.: needs to have an API key for the corresponding provider.

# NOTE: this example uses the `openai` engine against an OpenAI-compatible third-party
# endpoint (OpenRouter, see `base_url` below).
# The config is named `gen_args`, like in the other examples, so that it is the config
# actually passed to `call_llm` below (and not a stale one left over from a previous cell).
gen_args = {
    "model": "qwen/qwen2.5-vl-72b-instruct:free",
    "engine": "openai",
    "metadata": {
        "base_url": "https://openrouter.ai/api/v1",
        "provider": "openrouter",
    },
    "num_generations": 1,
    "temperature": 1.0,
    "top_p": 0.95,
    "top_k": 40,
    "max_tokens": 256,
}

# No need for any change in the inputs.
response, model_generations = call_llm(gen_args, inputs, conversation_dir=conversation_dir, usage_dir=usage_dir)

### Hugging Face - third-party providers that use OpenAI client

## 4) Prompting and get_messages

In [4]:
from llms.prompt_utils import get_messages, get_message

The functions `get_messages` and `get_message` give more fine-grained control over how prompts are sent. 
- Obs.: Anything can also be done via the flexible list of inputs used in (3).

The function `get_message` creates a single `Message` object given:
- `inputs`: list of raw data in flexible format (same way as given to `call_llm` as explained in (3))
- `role`: of the entity responsible for the message
- `name` of the entity responsible for the message
- `img_detail`: for providers that support, defines how much details to apply to the image

`get_messages` is the same thing, but gives you a list of `Message` objects instead. It also allows:
- to give the `sys_prompt` via an argument as well.
- concatenate consecutive texts into one `Message` by setting `concatenate_text=True`

Consider the same `inputs` as before. We can create a list of Message objects from it as below. This is exactly what `call_llm` does behind the scenes.

In [None]:
# Same flexible list of inputs as before.
# Every entry is a separate message, so each one must be followed by a comma: two adjacent
# string literals without a comma are silently merged into a single string by Python.
inputs=[
        {"role": "system", "text": "You are an intelligent and helpful assistant."}, 
        "Describe **all** the below items.",
        {"role": "user", "text": "Item (1):", "image": "llms/examples/cat.png"}, # Another way to send an input
        {"role": "user", "contents":[{"type": "text", "text": "Item (2):"}, {"type": "image", "image": "llms/examples/dog.png"}]}, #OpenAI chat completion format
        "Item (3): Once upon a time, there was a princess who lived in a castle.",
        "Provide your response as follows: <Title for Item 1> <Description for Item 1> <Title for Item 2> <Description for Item 2> <Title for Item 3> <Description for Item 3>",
  ]

# Create a list of Message objects from the inputs
messages = get_messages(inputs)

messages

Examples:

In [None]:
# Create a message object with higher image detail; note we can also give a `name` to the user (some providers support it)
msg_ex_user = get_message(["Item (1):", "llms/examples/cat.png"], role="user", name="example_user", img_detail="high")

# Create an ASSISTANT message; note we can also give a `name` to the assistant (some providers support it)
msg_ex_assistant = get_message(["This is a cat"], role="assistant", name="example_assistant")

# Create a SYSTEM message
msg_system = get_message("You are an intelligent and helpful assistant.", role="system")


# get a full prompt to send to the model
get_messages(
    [
        msg_system,
        msg_ex_user,
        msg_ex_assistant,
        {"role": "user", "contents":[{"type": "text", "text": "Item (2):"}, {"type": "image", "image": "llms/examples/dog.png"}]}, #OpenAI chat completion format
        # The last two entries are two separate text inputs (note the comma between them);
        # without the comma Python would merge the literals into one string with no separator.
        "Item (3): Once upon a time, there was a princess who lived in a castle.",
        "Please describe the new items in the conversation.",
    ],
    concatenate_text=True, # Concatenate consecutive texts into one `Message`. Note the last two are all in the same message.
    role="user", # This role is applied to all messages without a role. (e.g.: last two)
    name="user", # This name is applied to all messages without a name. (e.g.: last two)
)


## 5) Batch Call LLM

Basically sends a batch of messages to MLLM and receives a batch of responses.
- For API providers, parallel calls are performed behind the scenes
- For HF, uses the usual inference with tensors. 
- Just give a long list of prompts and set `max_batch_size` to control how many messages are sent to the model per time

In [None]:
from llms.llm_utils import batch_call_llm

# TODO: add examples / explanation

gen_args = dict(
    model="gpt-4o-2024-08-06",
    temperature=0.5,
    max_tokens=1000,
    top_p=0.95,
)


def _numbered_image_prompt(image_paths):
    """Return a system message followed by one `Item (i):` user message per image path."""
    prompt = [{"role": "system", "text": "You are an intelligent and helpful assistant."}]
    for position, image_path in enumerate(image_paths, start=1):
        prompt.append({"role": "user", "text": f"Item ({position}):", "image": image_path})
    return prompt


# First prompt: five items alternating between the cat and the dog image
msgs1 = _numbered_image_prompt(
    [
        "llms/examples/cat.png",
        "llms/examples/dog.png",
        "llms/examples/cat.png",
        "llms/examples/dog.png",
        "llms/examples/cat.png",
    ]
)

# Second prompt: two items only
msgs2 = _numbered_image_prompt(["llms/examples/cat.png", "llms/examples/dog.png"])

# One log directory / call id per prompt in the batch
conversation_dirs = ["./conversation_dir1", "./conversation_dir2"]
usage_dirs = ["./usage_dir1", "./usage_dir2"]
call_ids = ["call1", "call2"]

batch_call_llm(
    gen_kwargs=gen_args,
    messages=[msgs1, msgs2],
    max_batch_size=10,  # Max number of messages to send in each batch
    conversation_dirs=conversation_dirs,
    usage_dirs=usage_dirs,
    call_ids=call_ids,
)

