27 changes: 17 additions & 10 deletions ads/aqua/modeldeployment/deployment.py
@@ -1288,35 +1288,42 @@ def validate_deployment_params(

def recommend_shape(self, **kwargs) -> Union[Table, ShapeRecommendationReport]:
"""
For the CLI (set generate_table = True), generates the table (in rich diff) with valid
For the CLI (generate_table = True by default), generates the table (in rich diff) with valid
GPU deployment shapes for the provided model and configuration.

For the API (set generate_table = False), generates the JSON with valid
GPU deployment shapes for the provided model and configuration.

Validates if recommendations are generated, calls method to construct the rich diff
table with the recommendation data.
Validates the input and determines whether recommendations are available.

Parameters
----------
model_ocid : str
OCID of the model to recommend feasible compute shapes.
**kwargs
model_ocid : str
(Required) The OCID of the model to recommend feasible compute shapes for.
generate_table : bool, optional
If True, generate and return a rich-diff table; if False, return a JSON response (default is False).
compartment_id : str, optional
The OCID of the user's compartment to use for the recommendation.

Returns
-------
Table (generate_table = True)
A table format for the recommendation report with compatible deployment shapes
or troubleshooting info citing the largest shapes if no shape is suitable.
If `generate_table` is True, a table displaying the recommendation report with compatible deployment shapes,
or troubleshooting info if no shape is suitable.

ShapeRecommendationReport (generate_table = False)
A recommendation report with compatible deployment shapes, or troubleshooting info
citing the largest shapes if no shape is suitable.
If `generate_table` is False, a structured recommendation report with compatible deployment shapes,
or troubleshooting info citing the largest shapes if no shape is suitable.

Raises
------
AquaValueError
If model type is unsupported by tool (no recommendation report generated)
If the model type is unsupported and no recommendation report can be generated.
"""
deployment_config = self.get_deployment_config(model_id=kwargs.get("model_id"))
Review comment (Member): nit: add a validation here to check if kwargs has model id before passing to get_deployment_config; see the sketch after this hunk.

kwargs["deployment_config"] = deployment_config

try:
request = RequestRecommend(**kwargs)
except ValidationError as e:
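Following up on the review comment above, a minimal sketch of the suggested guard. It assumes `AquaValueError` (already referenced in the docstring's Raises section) is importable in this module and that the kwarg is named `model_id`, matching the call shown in the hunk; none of this is part of the merged diff.

# Hypothetical validation: fail fast when the model OCID is missing
# instead of passing None through to get_deployment_config.
model_id = kwargs.get("model_id")
if not model_id:
    raise AquaValueError(
        "A model OCID (`model_id`) is required to generate shape recommendations."
    )
deployment_config = self.get_deployment_config(model_id=model_id)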
17 changes: 17 additions & 0 deletions ads/aqua/shaperecommend/constants.py
@@ -38,6 +38,14 @@
"4bit": ["No smaller quantization available"],
}

RUNTIME_WEIGHTS = {
"use_bfloat16": "bfloat16",
"use_fp16": "float16",
"use_fp32": "float32",
"use_int8": "int8",
"use_int4": "int4",
"use_bfloat32": "bfloat32",
}

TEXT_GENERATION = "text_generation"
SAFETENSORS = "safetensors"
@@ -78,14 +86,23 @@

IN_FLIGHT_QUANTIZATION = {"4bit"} # vLLM only supports 4bit in-flight-quantization

VLLM_PARAMS_FAMILY = "VLLM_PARAMS"
VLLM_ENV = "VLLM"

QUANT_FLAG = "--quantization"
WEIGHT_DTYPE_FLAG = "--dtype"
MAX_MODEL_LEN_FLAG = "--max-model-len"

TROUBLESHOOT_MSG = "The selected model is too large to fit on standard GPU shapes with the current configuration.\nAs troubleshooting, we have suggested the two largest available GPU shapes using the smallest quantization level ('4bit') to maximize chances of fitting the model. "

VLLM_PARAMS = {
"max_model_len": "--max-model-len",
"in_flight_quant": "--quantization bitsandbytes --load-format bitsandbytes",
"trust_remote_code": "--trust-remote-code",
}

DEFAULT_WEIGHT_SIZE = "float32"
DEFAULT_MAX_SEQ_LEN = 4096

BITS_AND_BYTES_8BIT = "8bit"
BITS_AND_BYTES_4BIT = "4bit"
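For illustration only, one way constants like RUNTIME_WEIGHTS and DEFAULT_WEIGHT_SIZE could be consumed; the helper name and the config shape below are hypothetical and not taken from this PR.

# Hypothetical helper: resolve a weight dtype from boolean runtime flags
# in a model config dict, falling back to DEFAULT_WEIGHT_SIZE ("float32").
def resolve_weight_dtype(config: dict) -> str:
    for flag, dtype in RUNTIME_WEIGHTS.items():
        if config.get(flag):
            return dtype
    return DEFAULT_WEIGHT_SIZE

# resolve_weight_dtype({"use_bfloat16": True})  -> "bfloat16"
# resolve_weight_dtype({})                      -> "float32"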
96 changes: 57 additions & 39 deletions ads/aqua/shaperecommend/estimator.py
@@ -46,18 +46,18 @@ def kv_cache_memory(self) -> float:
Uses num_attention_heads (assumes no GQA, each attention head has its own query, key, value) for estimation.
"""
seq_len = self.seq_len or self.llm_config.max_seq_len
c = self.llm_config
llm_config = self.llm_config
kv_cache_dtype_bytes = QUANT_MAPPING.get(
c.weight_dtype, 2
llm_config.weight_dtype, 2
) # vLLM uses model's weight applied to KV cache

total_bytes = (
self.batch_size
* c.num_hidden_layers
* llm_config.num_hidden_layers
* 2
* c.num_attention_heads
* llm_config.num_attention_heads
* seq_len
* c.head_dim
* llm_config.head_dim
* kv_cache_dtype_bytes
)
return total_bytes / 1e9
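As a sanity check on the formula above, a standalone sketch with illustrative numbers (roughly a 7B-class model with full multi-head attention; none of these values come from the diff):

# Illustrative only: 32 layers, 32 attention heads, head_dim 128,
# a 4096-token sequence, and a float16 KV cache (2 bytes per element).
batch_size = 1
num_hidden_layers = 32
num_attention_heads = 32
head_dim = 128
seq_len = 4096
kv_cache_dtype_bytes = 2

total_bytes = (
    batch_size * num_hidden_layers * 2 * num_attention_heads
    * seq_len * head_dim * kv_cache_dtype_bytes
)
print(total_bytes / 1e9)  # ~2.15 GB of KV cache for one sequence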
@@ -69,15 +69,17 @@ def model_memory(self) -> float:

Model Parameter estimation: Standard decoder-only, untied/tied embeddings possible.
"""
c = self.llm_config
embedding_count = 1 if getattr(c, "tie_word_embeddings", True) else 2
llm_config = self.llm_config
Review comment from @VipulMascarenhas (Member), Sep 16, 2025: nit: we could just do self.llm_config below instead of declaring a new var, just personal preference. We can keep it as is for now.

embedding_count = 1 if llm_config.tie_word_embeddings else 2
embedding_params = (
embedding_count * c.vocab_size * c.hidden_size
embedding_count * llm_config.vocab_size * llm_config.hidden_size
) # input and output untied
layer_params = 12 * c.num_hidden_layers * (c.hidden_size**2) # GPT-style
layer_params = (
12 * llm_config.num_hidden_layers * (llm_config.hidden_size**2)
) # GPT-style
num_params = layer_params + embedding_params

return num_params * c.bytes_per_parameter / 1e9
return num_params * llm_config.bytes_per_parameter / 1e9
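A worked instance of the same estimate with GPT-2-XL-like dimensions (illustrative values assumed here, not read from any config in this PR):

# 12 * L * H^2 transformer parameters plus one tied embedding matrix.
num_hidden_layers = 48
hidden_size = 1600
vocab_size = 50257
bytes_per_parameter = 2  # float16 assumed for illustration

embedding_params = 1 * vocab_size * hidden_size           # tied embeddings
layer_params = 12 * num_hidden_layers * hidden_size ** 2
num_params = layer_params + embedding_params              # ~1.55e9 parameters
print(num_params * bytes_per_parameter / 1e9)             # ~3.1 GB of weights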

@property
def total_memory(self) -> float:
@@ -120,17 +122,24 @@ def construct_deployment_params(self) -> str:
-------
str: Parameter string for model deployment.
"""
c = self.llm_config
llm_config = self.llm_config
params = []
if self.seq_len < c.max_seq_len:
if self.seq_len < llm_config.max_seq_len:
params.append(VLLM_PARAMS["max_model_len"])
params.append(str(self.seq_len))

# Only suggest in-flight quantization for unquantized models when such quantization is requested
if not c.quantization and c.in_flight_quantization in IN_FLIGHT_QUANTIZATION:
if (
not llm_config.quantization
and llm_config.in_flight_quantization in IN_FLIGHT_QUANTIZATION
):
# vLLM only supports 4bit in-flight quantization
params.append(VLLM_PARAMS["in_flight_quant"])

# add trust-remote-code if custom modules are specified
if llm_config.trust_remote_code:
params.append(VLLM_PARAMS["trust_remote_code"])

params = " ".join(params) if params else ""
return params
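For a hypothetical model whose max_seq_len is 8192, deployed with seq_len = 4096, unquantized weights with in_flight_quantization = "4bit", and trust_remote_code enabled, the VLLM_PARAMS entries above would be joined into a string along the lines of:

--max-model-len 4096 --quantization bitsandbytes --load-format bitsandbytes --trust-remote-code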

@@ -154,12 +163,12 @@ def suggest_param_advice(self, allowed: float) -> str:
wt_gb = self.model_memory
batch_size = self.batch_size
seq_len = self.seq_len
weight_size = getattr(self.llm_config, "weight_dtype", "unknown")
weight_size = self.llm_config.weight_dtype
config = self.llm_config

suggested_quant_msg = None
quant_advice = ", ".join(config.suggested_quantizations)
quantization = getattr(config, "quantization", None)
quantization = config.quantization

advice = []

@@ -246,7 +255,7 @@ def limiting_factor(
)
else:
advice = (
f"No override PARAMS needed. \n\nModel fits well within the allowed compute shape "
f"Model fits well within the allowed compute shape "
f"({required:.1f}GB used / {allowed_gpu_memory:.1f}GB allowed)."
)
return advice
@@ -268,22 +277,22 @@ def model_memory(self) -> float:
Returns estimated model parameter memory (in GB), accurately accounting
for Llama-style attention and MLP, and tied or untied embeddings.
"""
c = self.llm_config
llm_config = self.llm_config

embedding_params, attn_params = self._calc_attn_embed_params()

# MLP params
gate_proj = c.hidden_size * c.intermediate_size
up_proj = c.hidden_size * c.intermediate_size
down_proj = c.intermediate_size * c.hidden_size
gate_proj = llm_config.hidden_size * llm_config.intermediate_size
up_proj = llm_config.hidden_size * llm_config.intermediate_size
down_proj = llm_config.intermediate_size * llm_config.hidden_size
mlp_params = gate_proj + up_proj + down_proj

# Total per-layer
layer_params = attn_params + mlp_params
# Total params
num_params = c.num_hidden_layers * layer_params + embedding_params
num_params = llm_config.num_hidden_layers * layer_params + embedding_params

return num_params * c.bytes_per_parameter / 1e9
return num_params * llm_config.bytes_per_parameter / 1e9
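Plugging Llama-2-7B-style dimensions into this estimate as an illustration (values assumed from the published config, not taken from this PR) reproduces the familiar ~6.7B parameter count:

# Attention + gate/up/down MLP + untied input/output embeddings.
hidden_size = 4096
intermediate_size = 11008
num_hidden_layers = 32
vocab_size = 32000
num_key_value_heads = 32   # Llama-2-7B uses full MHA (no KV-head grouping)
head_dim = 128
bytes_per_parameter = 2    # float16

embedding_params = 2 * vocab_size * hidden_size                 # untied
attn_params = (
    2 * hidden_size * hidden_size                               # q_proj, o_proj
    + 2 * hidden_size * num_key_value_heads * head_dim          # k_proj, v_proj
)
mlp_params = 3 * hidden_size * intermediate_size                # gate/up/down
num_params = num_hidden_layers * (attn_params + mlp_params) + embedding_params
print(num_params / 1e9)                          # ~6.74B parameters
print(num_params * bytes_per_parameter / 1e9)    # ~13.5 GB of float16 weights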

@property
def kv_cache_memory(self) -> float:
@@ -293,18 +302,18 @@ def kv_cache_memory(self) -> float:
Grouped Query Attention uses num_key_value_heads, where groups of Q heads share a K and V projection.
num_key_value_heads < num_attention_heads, which reduces the KV Cache size.
"""
c = self.llm_config
seq_len = self.seq_len or getattr(c, "max_seq_len", 2048)
kv_cache_dtype_bytes = QUANT_MAPPING.get(c.weight_dtype, 2)
kv_heads = c.num_key_value_heads
llm_config = self.llm_config
seq_len = self.seq_len or llm_config.max_seq_len
kv_cache_dtype_bytes = QUANT_MAPPING.get(llm_config.weight_dtype, 2)
kv_heads = llm_config.num_key_value_heads

total_bytes = (
self.batch_size
* c.num_hidden_layers
* llm_config.num_hidden_layers
* 2
* kv_heads
* seq_len
* c.head_dim
* llm_config.head_dim
* kv_cache_dtype_bytes
)
return total_bytes / 1e9
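A quick comparison against the MHA estimate earlier in this file, using illustrative Llama-3-8B-style values (8 KV heads instead of 32 query heads; assumed, not read from the diff):

batch_size = 1
num_hidden_layers = 32
num_key_value_heads = 8
head_dim = 128
seq_len = 4096
kv_cache_dtype_bytes = 2  # float16

total_bytes = (
    batch_size * num_hidden_layers * 2 * num_key_value_heads
    * seq_len * head_dim * kv_cache_dtype_bytes
)
print(total_bytes / 1e9)  # ~0.54 GB, 4x smaller than the 32-head MHA case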
@@ -313,17 +322,23 @@ def _calc_attn_embed_params(self) -> tuple:
"""
Returns the embedding parameter count and attention parameter count for Llama-family (GQA) models.
"""
c = self.llm_config
llm_config = self.llm_config

# Embedding parameters
# assume tied embeddings unless tie_word_embeddings = False
embedding_count = 1 if getattr(c, "tie_word_embeddings", True) else 2
embedding_params = embedding_count * c.vocab_size * c.hidden_size
embedding_count = 1 if llm_config.tie_word_embeddings else 2
embedding_params = (
embedding_count * llm_config.vocab_size * llm_config.hidden_size
)

q_proj = c.hidden_size * c.hidden_size
k_proj = c.hidden_size * (c.num_key_value_heads * c.head_dim)
v_proj = c.hidden_size * (c.num_key_value_heads * c.head_dim)
o_proj = c.hidden_size * c.hidden_size
q_proj = llm_config.hidden_size * llm_config.hidden_size
k_proj = llm_config.hidden_size * (
llm_config.num_key_value_heads * llm_config.head_dim
)
v_proj = llm_config.hidden_size * (
llm_config.num_key_value_heads * llm_config.head_dim
)
o_proj = llm_config.hidden_size * llm_config.hidden_size
attn_params = q_proj + k_proj + v_proj + o_proj

return embedding_params, attn_params
@@ -342,21 +357,24 @@ def model_memory(self) -> float:

Returns the estimated memory size of the MoE Model (in GB).
"""
c = self.llm_config
llm_config = self.llm_config
# Attention parameter count (Llama-style)
embedding_params, attn_params = self._calc_attn_embed_params()

# MoE MLP params per layer
moe_params_per_layer = (
c.num_local_experts * 3 * c.hidden_size * c.intermediate_size
llm_config.num_local_experts
* 3
* llm_config.hidden_size
* llm_config.intermediate_size
)
total_params = (
c.num_hidden_layers * (attn_params + moe_params_per_layer)
llm_config.num_hidden_layers * (attn_params + moe_params_per_layer)
+ embedding_params
)

# Convert to GB
return total_params * c.bytes_per_parameter / 1e9
return total_params * llm_config.bytes_per_parameter / 1e9
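As an illustration, Mixtral-8x7B-style dimensions (assumed from the published config, not from this PR) plugged into this estimate land close to the advertised ~46.7B total parameters; the router/gate weights are ignored by the formula, so it is a slight underestimate.

hidden_size = 4096
intermediate_size = 14336
num_hidden_layers = 32
num_local_experts = 8
vocab_size = 32000
num_key_value_heads = 8
head_dim = 128
bytes_per_parameter = 2  # float16

embedding_params = 2 * vocab_size * hidden_size                   # untied
attn_params = (
    2 * hidden_size * hidden_size
    + 2 * hidden_size * num_key_value_heads * head_dim
)
moe_params_per_layer = num_local_experts * 3 * hidden_size * intermediate_size
total_params = (
    num_hidden_layers * (attn_params + moe_params_per_layer) + embedding_params
)
print(total_params / 1e9)                         # ~46.7B parameters
print(total_params * bytes_per_parameter / 1e9)   # ~93 GB of float16 weights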


def get_estimator(llm_config, **kwargs) -> MemoryEstimator: