Commit ed095fd

[WIP- FINAL][AQUA][GPU Shape Recommendation] Support for Service Managed Models #1252 (#1267)
1 parent 054b2fc commit ed095fd

21 files changed: +2168 -224 lines changed

ads/aqua/modeldeployment/deployment.py

Lines changed: 17 additions & 10 deletions
@@ -1288,35 +1288,42 @@ def validate_deployment_params(
 
     def recommend_shape(self, **kwargs) -> Union[Table, ShapeRecommendationReport]:
         """
-        For the CLI (set generate_table = True), generates the table (in rich diff) with valid
+        For the CLI (set by default, generate_table = True), generates the table (in rich diff) with valid
         GPU deployment shapes for the provided model and configuration.
 
         For the API (set generate_table = False), generates the JSON with valid
         GPU deployment shapes for the provided model and configuration.
 
-        Validates if recommendations are generated, calls method to construct the rich diff
-        table with the recommendation data.
+        Validates the input and determines whether recommendations are available.
 
         Parameters
         ----------
-        model_ocid : str
-            OCID of the model to recommend feasible compute shapes.
+        **kwargs
+            model_ocid : str
+                (Required) The OCID of the model to recommend feasible compute shapes for.
+            generate_table : bool, optional
+                If True, generate and return a rich-diff table; if False, return a JSON response (default is False).
+            compartment_id : str, optional
+                The OCID of the user's compartment to use for the recommendation.
 
         Returns
         -------
         Table (generate_table = True)
-            A table format for the recommendation report with compatible deployment shapes
-            or troubleshooting info citing the largest shapes if no shape is suitable.
+            If `generate_table` is True, a table displaying the recommendation report with compatible deployment shapes,
+            or troubleshooting info if no shape is suitable.
 
         ShapeRecommendationReport (generate_table = False)
-            A recommendation report with compatible deployment shapes, or troubleshooting info
-            citing the largest shapes if no shape is suitable.
+            If `generate_table` is False, a structured recommendation report with compatible deployment shapes,
+            or troubleshooting info citing the largest shapes if no shape is suitable.
 
         Raises
         ------
         AquaValueError
-            If model type is unsupported by tool (no recommendation report generated)
+            If the model type is unsupported and no recommendation report can be generated.
         """
+        deployment_config = self.get_deployment_config(model_id=kwargs.get("model_id"))
+        kwargs["deployment_config"] = deployment_config
+
         try:
             request = RequestRecommend(**kwargs)
         except ValidationError as e:
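
A minimal usage sketch of the updated **kwargs-based signature above. The class name AquaDeploymentApp and the OCID values are assumptions for illustration; the keyword name follows the `model_id` key actually read by the code in this hunk.

    from ads.aqua.modeldeployment.deployment import AquaDeploymentApp

    app = AquaDeploymentApp()

    # API-style call: returns a ShapeRecommendationReport (generate_table=False).
    report = app.recommend_shape(
        model_id="ocid1.datasciencemodel.oc1..<unique_id>",
        compartment_id="ocid1.compartment.oc1..<unique_id>",
        generate_table=False,
    )

    # CLI-style call: generate_table=True renders the rich-diff table instead.
    table = app.recommend_shape(
        model_id="ocid1.datasciencemodel.oc1..<unique_id>",
        generate_table=True,
    )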

ads/aqua/shaperecommend/constants.py

Lines changed: 17 additions & 0 deletions
@@ -38,6 +38,14 @@
     "4bit": ["No smaller quantization available"],
 }
 
+RUNTIME_WEIGHTS = {
+    "use_bfloat16": "bfloat16",
+    "use_fp16": "float16",
+    "use_fp32": "float32",
+    "use_int8": "int8",
+    "use_int4": "int4",
+    "use_bfloat32": "bfloat32",
+}
 
 TEXT_GENERATION = "text_generation"
 SAFETENSORS = "safetensors"
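
A small sketch of how a mapping like RUNTIME_WEIGHTS could be consumed to derive a weight dtype from runtime flags; the helper detect_weight_dtype is hypothetical and not part of this diff.

    # Hypothetical consumer of RUNTIME_WEIGHTS; the real call site is not shown in this hunk.
    RUNTIME_WEIGHTS = {
        "use_bfloat16": "bfloat16",
        "use_fp16": "float16",
        "use_fp32": "float32",
        "use_int8": "int8",
        "use_int4": "int4",
        "use_bfloat32": "bfloat32",
    }
    DEFAULT_WEIGHT_SIZE = "float32"

    def detect_weight_dtype(runtime_config: dict) -> str:
        """Return the dtype of the first truthy use_* flag, else the default."""
        for flag, dtype in RUNTIME_WEIGHTS.items():
            if runtime_config.get(flag):
                return dtype
        return DEFAULT_WEIGHT_SIZE

    print(detect_weight_dtype({"use_bfloat16": True}))  # -> bfloat16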
@@ -78,14 +86,23 @@
 
 IN_FLIGHT_QUANTIZATION = {"4bit"}  # vLLM only supports 4bit in-flight-quantization
 
+VLLM_PARAMS_FAMILY = "VLLM_PARAMS"
+VLLM_ENV = "VLLM"
+
+QUANT_FLAG = "--quantization"
+WEIGHT_DTYPE_FLAG = "--dtype"
+MAX_MODEL_LEN_FLAG = "--max-model-len"
+
 TROUBLESHOOT_MSG = "The selected model is too large to fit on standard GPU shapes with the current configuration.\nAs troubleshooting, we have suggested the two largest available GPU shapes using the smallest quantization level ('4bit') to maximize chances of fitting the model. "
 
 VLLM_PARAMS = {
     "max_model_len": "--max-model-len",
     "in_flight_quant": "--quantization bitsandbytes --load-format bitsandbytes",
+    "trust_remote_code": "--trust-remote-code",
 }
 
 DEFAULT_WEIGHT_SIZE = "float32"
+DEFAULT_MAX_SEQ_LEN = 4096
 
 BITS_AND_BYTES_8BIT = "8bit"
 BITS_AND_BYTES_4BIT = "4bit"
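
A short sketch, under the assumption that the new flag constants are meant to be joined into a vLLM-style parameter string; build_params below is illustrative only and not defined anywhere in this diff.

    QUANT_FLAG = "--quantization"
    WEIGHT_DTYPE_FLAG = "--dtype"
    MAX_MODEL_LEN_FLAG = "--max-model-len"
    DEFAULT_MAX_SEQ_LEN = 4096

    def build_params(dtype="float16", quantization=None, max_len=DEFAULT_MAX_SEQ_LEN) -> str:
        parts = [WEIGHT_DTYPE_FLAG, dtype, MAX_MODEL_LEN_FLAG, str(max_len)]
        if quantization:
            parts += [QUANT_FLAG, quantization]
        return " ".join(parts)

    print(build_params())                            # --dtype float16 --max-model-len 4096
    print(build_params(quantization="bitsandbytes"))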

ads/aqua/shaperecommend/estimator.py

Lines changed: 57 additions & 39 deletions
@@ -46,18 +46,18 @@ def kv_cache_memory(self) -> float:
         Uses num_attention_heads (assumes no GQA, each attention head has its own query, key, value) for estimation.
         """
         seq_len = self.seq_len or self.llm_config.max_seq_len
-        c = self.llm_config
+        llm_config = self.llm_config
         kv_cache_dtype_bytes = QUANT_MAPPING.get(
-            c.weight_dtype, 2
+            llm_config.weight_dtype, 2
         )  # vLLM uses model's weight applied to KV cache
 
         total_bytes = (
             self.batch_size
-            * c.num_hidden_layers
+            * llm_config.num_hidden_layers
             * 2
-            * c.num_attention_heads
+            * llm_config.num_attention_heads
             * seq_len
-            * c.head_dim
+            * llm_config.head_dim
             * kv_cache_dtype_bytes
         )
         return total_bytes / 1e9
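
The formula above, restated as a standalone calculation (no GQA: each attention head keeps its own K and V). The example numbers are illustrative, roughly a 7B Llama-style configuration, not taken from the diff.

    def kv_cache_gb(batch_size, num_hidden_layers, num_attention_heads,
                    seq_len, head_dim, dtype_bytes=2):
        # 2x accounts for storing both the K and the V tensor per layer.
        total_bytes = (
            batch_size * num_hidden_layers * 2 * num_attention_heads
            * seq_len * head_dim * dtype_bytes
        )
        return total_bytes / 1e9

    print(kv_cache_gb(1, 32, 32, 4096, 128))  # ~2.15 GB at 2 bytes per value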
@@ -69,15 +69,17 @@ def model_memory(self) -> float:
 
         Model Parameter estimation: Standard decoder-only, untied/tied embeddings possible.
         """
-        c = self.llm_config
-        embedding_count = 1 if getattr(c, "tie_word_embeddings", True) else 2
+        llm_config = self.llm_config
+        embedding_count = 1 if llm_config.tie_word_embeddings else 2
         embedding_params = (
-            embedding_count * c.vocab_size * c.hidden_size
+            embedding_count * llm_config.vocab_size * llm_config.hidden_size
         )  # input and output untied
-        layer_params = 12 * c.num_hidden_layers * (c.hidden_size**2)  # GPT-style
+        layer_params = (
+            12 * llm_config.num_hidden_layers * (llm_config.hidden_size**2)
+        )  # GPT-style
         num_params = layer_params + embedding_params
 
-        return num_params * c.bytes_per_parameter / 1e9
+        return num_params * llm_config.bytes_per_parameter / 1e9
 
     @property
     def total_memory(self) -> float:
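
The same estimate as a standalone sketch: roughly 12 * num_hidden_layers * hidden_size^2 transformer parameters plus the embedding matrices, converted to GB. Example values are illustrative, not taken from the diff.

    def decoder_model_gb(num_hidden_layers, hidden_size, vocab_size,
                         tie_word_embeddings=True, bytes_per_parameter=2):
        embedding_count = 1 if tie_word_embeddings else 2
        embedding_params = embedding_count * vocab_size * hidden_size
        layer_params = 12 * num_hidden_layers * hidden_size**2  # GPT-style
        return (layer_params + embedding_params) * bytes_per_parameter / 1e9

    print(decoder_model_gb(32, 4096, 32000))  # ~13.1 GB of weights at float16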
@@ -120,17 +122,24 @@ def construct_deployment_params(self) -> str:
         -------
         str: Parameter string for model deployment.
         """
-        c = self.llm_config
+        llm_config = self.llm_config
         params = []
-        if self.seq_len < c.max_seq_len:
+        if self.seq_len < llm_config.max_seq_len:
             params.append(VLLM_PARAMS["max_model_len"])
             params.append(str(self.seq_len))
 
         # Only suggest in-flight quantization for unquantized models when such quantization is requested
-        if not c.quantization and c.in_flight_quantization in IN_FLIGHT_QUANTIZATION:
+        if (
+            not llm_config.quantization
+            and llm_config.in_flight_quantization in IN_FLIGHT_QUANTIZATION
+        ):
             # vLLM only supports 4bit in-flight quantization
             params.append(VLLM_PARAMS["in_flight_quant"])
 
+        # add trust-remote-code if custom modules are specified
+        if llm_config.trust_remote_code:
+            params.append(VLLM_PARAMS["trust_remote_code"])
+
         params = " ".join(params) if params else ""
         return params
 
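For reference, a tiny sketch of the string this method would produce for a config that needs a shorter context window and ships custom modules (the values are hypothetical):

    params = []
    seq_len, max_seq_len = 2048, 4096
    trust_remote_code = True

    if seq_len < max_seq_len:
        params += ["--max-model-len", str(seq_len)]
    if trust_remote_code:
        params.append("--trust-remote-code")

    print(" ".join(params))  # --max-model-len 2048 --trust-remote-code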
@@ -154,12 +163,12 @@ def suggest_param_advice(self, allowed: float) -> str:
         wt_gb = self.model_memory
         batch_size = self.batch_size
         seq_len = self.seq_len
-        weight_size = getattr(self.llm_config, "weight_dtype", "unknown")
+        weight_size = self.llm_config.weight_dtype
         config = self.llm_config
 
         suggested_quant_msg = None
         quant_advice = ", ".join(config.suggested_quantizations)
-        quantization = getattr(config, "quantization", None)
+        quantization = config.quantization
 
         advice = []
 
@@ -246,7 +255,7 @@ def limiting_factor(
             )
         else:
             advice = (
-                f"No override PARAMS needed. \n\nModel fits well within the allowed compute shape "
+                f"Model fits well within the allowed compute shape "
                 f"({required:.1f}GB used / {allowed_gpu_memory:.1f}GB allowed)."
             )
         return advice
@@ -268,22 +277,22 @@ def model_memory(self) -> float:
         Returns estimated model parameter memory (in GB), accurately accounting
         for Llama-style attention and MLP, and tied or untied embeddings.
         """
-        c = self.llm_config
+        llm_config = self.llm_config
 
         embedding_params, attn_params = self._calc_attn_embed_params()
 
         # MLP params
-        gate_proj = c.hidden_size * c.intermediate_size
-        up_proj = c.hidden_size * c.intermediate_size
-        down_proj = c.intermediate_size * c.hidden_size
+        gate_proj = llm_config.hidden_size * llm_config.intermediate_size
+        up_proj = llm_config.hidden_size * llm_config.intermediate_size
+        down_proj = llm_config.intermediate_size * llm_config.hidden_size
         mlp_params = gate_proj + up_proj + down_proj
 
         # Total per-layer
         layer_params = attn_params + mlp_params
         # Total params
-        num_params = c.num_hidden_layers * layer_params + embedding_params
+        num_params = llm_config.num_hidden_layers * layer_params + embedding_params
 
-        return num_params * c.bytes_per_parameter / 1e9
+        return num_params * llm_config.bytes_per_parameter / 1e9
 
     @property
     def kv_cache_memory(self) -> float:
@@ -293,18 +302,18 @@ def kv_cache_memory(self) -> float:
         Grouped Query Attention uses num_key_value_heads, which groups of Q heads share a K and V projection.
         num_key_value_heads < num_attention_heads, which reduces the KV Cache size.
         """
-        c = self.llm_config
-        seq_len = self.seq_len or getattr(c, "max_seq_len", 2048)
-        kv_cache_dtype_bytes = QUANT_MAPPING.get(c.weight_dtype, 2)
-        kv_heads = c.num_key_value_heads
+        llm_config = self.llm_config
+        seq_len = self.seq_len or llm_config.max_seq_len
+        kv_cache_dtype_bytes = QUANT_MAPPING.get(llm_config.weight_dtype, 2)
+        kv_heads = llm_config.num_key_value_heads
 
         total_bytes = (
             self.batch_size
-            * c.num_hidden_layers
+            * llm_config.num_hidden_layers
             * 2
             * kv_heads
             * seq_len
-            * c.head_dim
+            * llm_config.head_dim
             * kv_cache_dtype_bytes
         )
         return total_bytes / 1e9
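
Same formula as the earlier KV-cache sketch, but scaled by num_key_value_heads: with 8 KV heads instead of 32 attention heads the cache shrinks by 4x. The numbers below are illustrative only.

    def gqa_kv_cache_gb(batch_size, num_hidden_layers, num_key_value_heads,
                        seq_len, head_dim, dtype_bytes=2):
        total_bytes = (
            batch_size * num_hidden_layers * 2 * num_key_value_heads
            * seq_len * head_dim * dtype_bytes
        )
        return total_bytes / 1e9

    print(gqa_kv_cache_gb(1, 32, 8, 4096, 128))   # ~0.54 GB with GQA
    print(gqa_kv_cache_gb(1, 32, 32, 4096, 128))  # ~2.15 GB without GQA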
@@ -313,17 +322,23 @@ def _calc_attn_embed_params(self) -> tuple:
         """
         Returns the embedding parameter count and attention parameter count for Llama-family (GQA) models.
         """
-        c = self.llm_config
+        llm_config = self.llm_config
 
         # Embedding parameters
         # assume tied embeddings unless tie_word_embeddings = False
-        embedding_count = 1 if getattr(c, "tie_word_embeddings", True) else 2
-        embedding_params = embedding_count * c.vocab_size * c.hidden_size
+        embedding_count = 1 if llm_config.tie_word_embeddings else 2
+        embedding_params = (
+            embedding_count * llm_config.vocab_size * llm_config.hidden_size
+        )
 
-        q_proj = c.hidden_size * c.hidden_size
-        k_proj = c.hidden_size * (c.num_key_value_heads * c.head_dim)
-        v_proj = c.hidden_size * (c.num_key_value_heads * c.head_dim)
-        o_proj = c.hidden_size * c.hidden_size
+        q_proj = llm_config.hidden_size * llm_config.hidden_size
+        k_proj = llm_config.hidden_size * (
+            llm_config.num_key_value_heads * llm_config.head_dim
+        )
+        v_proj = llm_config.hidden_size * (
+            llm_config.num_key_value_heads * llm_config.head_dim
+        )
+        o_proj = llm_config.hidden_size * llm_config.hidden_size
         attn_params = q_proj + k_proj + v_proj + o_proj
 
         return embedding_params, attn_params
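
A standalone restatement of the per-layer attention and embedding counts for a GQA block, mirroring _calc_attn_embed_params; the example sizes (hidden 4096, 8 KV heads, head_dim 128, vocab 32000) are illustrative.

    def attn_embed_params(hidden_size, num_key_value_heads, head_dim,
                          vocab_size, tie_word_embeddings=True):
        embedding_count = 1 if tie_word_embeddings else 2
        embedding_params = embedding_count * vocab_size * hidden_size

        q_proj = hidden_size * hidden_size
        k_proj = hidden_size * (num_key_value_heads * head_dim)
        v_proj = hidden_size * (num_key_value_heads * head_dim)
        o_proj = hidden_size * hidden_size
        return embedding_params, q_proj + k_proj + v_proj + o_proj

    print(attn_embed_params(4096, 8, 128, 32000))  # (131072000, 41943040)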
@@ -342,21 +357,24 @@ def model_memory(self) -> float:
 
         Returns the estimated memory size of the MoE Model (in GB).
         """
-        c = self.llm_config
+        llm_config = self.llm_config
         # Attention parameter count (Llama-style)
         embedding_params, attn_params = self._calc_attn_embed_params()
 
         # MoE MLP params per layer
         moe_params_per_layer = (
-            c.num_local_experts * 3 * c.hidden_size * c.intermediate_size
+            llm_config.num_local_experts
+            * 3
+            * llm_config.hidden_size
+            * llm_config.intermediate_size
         )
         total_params = (
-            c.num_hidden_layers * (attn_params + moe_params_per_layer)
+            llm_config.num_hidden_layers * (attn_params + moe_params_per_layer)
             + embedding_params
         )
 
         # Convert to GB
-        return total_params * c.bytes_per_parameter / 1e9
+        return total_params * llm_config.bytes_per_parameter / 1e9
 
 
 def get_estimator(llm_config, **kwargs) -> MemoryEstimator:
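
The MoE estimate restated as a standalone sketch: every expert's MLP weights are resident in memory even though only a few experts route per token. The Mixtral-8x7B-like example values are illustrative, not from the diff.

    def moe_model_gb(num_hidden_layers, num_local_experts, hidden_size,
                     intermediate_size, attn_params_per_layer, embedding_params,
                     bytes_per_parameter=2):
        moe_params_per_layer = num_local_experts * 3 * hidden_size * intermediate_size
        total_params = (
            num_hidden_layers * (attn_params_per_layer + moe_params_per_layer)
            + embedding_params
        )
        return total_params * bytes_per_parameter / 1e9

    # ~46.6B parameters -> ~93 GB of weights at 2 bytes per parameter
    print(moe_model_gb(32, 8, 4096, 14336, 41_943_040, 131_072_000))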
