@@ -46,18 +46,18 @@ def kv_cache_memory(self) -> float:
        Uses num_attention_heads (assumes no GQA; each attention head has its own query, key, and value) for estimation.
        """
        seq_len = self.seq_len or self.llm_config.max_seq_len
-        c = self.llm_config
+        llm_config = self.llm_config
        kv_cache_dtype_bytes = QUANT_MAPPING.get(
-            c.weight_dtype, 2
+            llm_config.weight_dtype, 2
        )  # vLLM uses model's weight applied to KV cache

        total_bytes = (
            self.batch_size
-            * c.num_hidden_layers
+            * llm_config.num_hidden_layers
            * 2
-            * c.num_attention_heads
+            * llm_config.num_attention_heads
            * seq_len
-            * c.head_dim
+            * llm_config.head_dim
            * kv_cache_dtype_bytes
        )
        return total_bytes / 1e9
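
As a quick sanity check on this formula, here is a standalone sketch with illustrative 7B-class values (32 layers, 32 heads, head_dim 128, fp16 cache); none of these numbers come from this change:

# Standalone sketch of the KV-cache estimate above (illustrative values, not from any real config).
batch_size = 1
num_hidden_layers = 32
num_attention_heads = 32
head_dim = 128
seq_len = 4096
kv_cache_dtype_bytes = 2  # fp16/bf16

total_bytes = (
    batch_size
    * num_hidden_layers
    * 2  # one K and one V tensor per layer
    * num_attention_heads
    * seq_len
    * head_dim
    * kv_cache_dtype_bytes
)
print(total_bytes / 1e9)  # ~2.15 GB for a single 4096-token sequence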
@@ -69,15 +69,17 @@ def model_memory(self) -> float:

        Model Parameter estimation: Standard decoder-only, untied/tied embeddings possible.
        """
-        c = self.llm_config
-        embedding_count = 1 if getattr(c, "tie_word_embeddings", True) else 2
+        llm_config = self.llm_config
+        embedding_count = 1 if llm_config.tie_word_embeddings else 2
        embedding_params = (
-            embedding_count * c.vocab_size * c.hidden_size
+            embedding_count * llm_config.vocab_size * llm_config.hidden_size
        )  # input and output untied
-        layer_params = 12 * c.num_hidden_layers * (c.hidden_size**2)  # GPT-style
+        layer_params = (
+            12 * llm_config.num_hidden_layers * (llm_config.hidden_size**2)
+        )  # GPT-style
        num_params = layer_params + embedding_params

-        return num_params * c.bytes_per_parameter / 1e9
+        return num_params * llm_config.bytes_per_parameter / 1e9

    @property
    def total_memory(self) -> float:
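
The 12 * num_hidden_layers * hidden_size**2 rule of thumb can be checked in isolation; the GPT-2-XL-style numbers below are assumptions for illustration, not values read by this estimator:

# Standalone check of the GPT-style parameter estimate (illustrative GPT-2-XL-like values).
vocab_size = 50257
hidden_size = 1600
num_hidden_layers = 48
bytes_per_parameter = 2  # fp16/bf16
tie_word_embeddings = True

embedding_count = 1 if tie_word_embeddings else 2
embedding_params = embedding_count * vocab_size * hidden_size
# 12 * H^2 per layer: ~4 * H^2 for attention (Q, K, V, O) plus ~8 * H^2 for the 4H MLP.
layer_params = 12 * num_hidden_layers * hidden_size**2
num_params = layer_params + embedding_params
print(num_params / 1e9)  # ~1.55 (billions of parameters)
print(num_params * bytes_per_parameter / 1e9)  # ~3.1 GB of weights in fp16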
@@ -120,17 +122,24 @@ def construct_deployment_params(self) -> str:
        -------
        str: Parameter string for model deployment.
        """
-        c = self.llm_config
+        llm_config = self.llm_config
        params = []
-        if self.seq_len < c.max_seq_len:
+        if self.seq_len < llm_config.max_seq_len:
            params.append(VLLM_PARAMS["max_model_len"])
            params.append(str(self.seq_len))

        # Only suggest in-flight quantization for unquantized models when such quantization is requested
-        if not c.quantization and c.in_flight_quantization in IN_FLIGHT_QUANTIZATION:
+        if (
+            not llm_config.quantization
+            and llm_config.in_flight_quantization in IN_FLIGHT_QUANTIZATION
+        ):
            # vLLM only supports 4bit in-flight quantization
            params.append(VLLM_PARAMS["in_flight_quant"])

+        # add trust-remote-code if custom modules are specified
+        if llm_config.trust_remote_code:
+            params.append(VLLM_PARAMS["trust_remote_code"])
+
        params = " ".join(params) if params else ""
        return params
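
For illustration only, the assembled override string might look like the sketch below. The actual VLLM_PARAMS mapping is defined elsewhere in this module, so the flag strings used here are assumptions rather than the module's real values:

# Hypothetical VLLM_PARAMS mapping; the real one lives elsewhere in this module and may differ.
VLLM_PARAMS = {
    "max_model_len": "--max-model-len",
    "in_flight_quant": "--quantization bitsandbytes --load-format bitsandbytes",
    "trust_remote_code": "--trust-remote-code",
}

params = []
seq_len, max_seq_len = 4096, 8192
if seq_len < max_seq_len:
    params.append(VLLM_PARAMS["max_model_len"])
    params.append(str(seq_len))
params.append(VLLM_PARAMS["trust_remote_code"])  # model ships custom modelling code
print(" ".join(params) if params else "")  # --max-model-len 4096 --trust-remote-code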
@@ -154,12 +163,12 @@ def suggest_param_advice(self, allowed: float) -> str:
        wt_gb = self.model_memory
        batch_size = self.batch_size
        seq_len = self.seq_len
-        weight_size = getattr(self.llm_config, "weight_dtype", "unknown")
+        weight_size = self.llm_config.weight_dtype
        config = self.llm_config

        suggested_quant_msg = None
        quant_advice = ", ".join(config.suggested_quantizations)
-        quantization = getattr(config, "quantization", None)
+        quantization = config.quantization

        advice = []
@@ -246,7 +255,7 @@ def limiting_factor(
            )
        else:
            advice = (
-                f"No override PARAMS needed.\n\nModel fits well within the allowed compute shape "
+                f"Model fits well within the allowed compute shape "
                f"({required:.1f} GB used / {allowed_gpu_memory:.1f} GB allowed)."
            )
        return advice
@@ -268,22 +277,22 @@ def model_memory(self) -> float:
        Returns estimated model parameter memory (in GB), accurately accounting
        for Llama-style attention and MLP, and tied or untied embeddings.
        """
-        c = self.llm_config
+        llm_config = self.llm_config

        embedding_params, attn_params = self._calc_attn_embed_params()

        # MLP params
-        gate_proj = c.hidden_size * c.intermediate_size
-        up_proj = c.hidden_size * c.intermediate_size
-        down_proj = c.intermediate_size * c.hidden_size
+        gate_proj = llm_config.hidden_size * llm_config.intermediate_size
+        up_proj = llm_config.hidden_size * llm_config.intermediate_size
+        down_proj = llm_config.intermediate_size * llm_config.hidden_size
        mlp_params = gate_proj + up_proj + down_proj

        # Total per-layer
        layer_params = attn_params + mlp_params
        # Total params
-        num_params = c.num_hidden_layers * layer_params + embedding_params
+        num_params = llm_config.num_hidden_layers * layer_params + embedding_params

-        return num_params * c.bytes_per_parameter / 1e9
+        return num_params * llm_config.bytes_per_parameter / 1e9

    @property
    def kv_cache_memory(self) -> float:
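
A standalone check of this Llama-style accounting, using Llama-2-7B-like numbers (assumed here for illustration, not read from any config; no GQA, untied embeddings):

# Standalone check of the Llama-style estimate (illustrative Llama-2-7B-like values).
hidden_size, intermediate_size = 4096, 11008
num_hidden_layers, num_key_value_heads, head_dim = 32, 32, 128
vocab_size, bytes_per_parameter = 32000, 2

embedding_params = 2 * vocab_size * hidden_size  # untied input and output embeddings
# Q and O are hidden_size x hidden_size; K and V shrink with fewer KV heads.
attn_params = 2 * hidden_size**2 + 2 * hidden_size * (num_key_value_heads * head_dim)
mlp_params = 3 * hidden_size * intermediate_size  # gate, up, down projections
num_params = num_hidden_layers * (attn_params + mlp_params) + embedding_params
print(num_params / 1e9)  # ~6.7 (billions of parameters, close to the published 6.74B)
print(num_params * bytes_per_parameter / 1e9)  # ~13.5 GB of weights in fp16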
@@ -293,18 +302,18 @@ def kv_cache_memory(self) -> float:
        Grouped Query Attention uses num_key_value_heads, where groups of query heads share one K and V projection.
        num_key_value_heads < num_attention_heads, which reduces the KV cache size.
        """
-        c = self.llm_config
-        seq_len = self.seq_len or getattr(c, "max_seq_len", 2048)
-        kv_cache_dtype_bytes = QUANT_MAPPING.get(c.weight_dtype, 2)
-        kv_heads = c.num_key_value_heads
+        llm_config = self.llm_config
+        seq_len = self.seq_len or llm_config.max_seq_len
+        kv_cache_dtype_bytes = QUANT_MAPPING.get(llm_config.weight_dtype, 2)
+        kv_heads = llm_config.num_key_value_heads

        total_bytes = (
            self.batch_size
-            * c.num_hidden_layers
+            * llm_config.num_hidden_layers
            * 2
            * kv_heads
            * seq_len
-            * c.head_dim
+            * llm_config.head_dim
            * kv_cache_dtype_bytes
        )
        return total_bytes / 1e9
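
To see how much GQA saves, the same formula can be evaluated with and without grouped KV heads; the 8B-class values below are assumptions for illustration:

# Standalone sketch comparing full multi-head attention with GQA for the KV cache.
batch_size, num_hidden_layers = 1, 32
num_attention_heads, num_key_value_heads = 32, 8
head_dim, seq_len, kv_cache_dtype_bytes = 128, 8192, 2

def kv_cache_gb(kv_heads: int) -> float:
    # Same product as above: batch * layers * (K and V) * heads * tokens * head_dim * bytes.
    return (
        batch_size * num_hidden_layers * 2 * kv_heads * seq_len * head_dim * kv_cache_dtype_bytes
    ) / 1e9

print(kv_cache_gb(num_attention_heads))  # ~4.3 GB if every query head kept its own K/V
print(kv_cache_gb(num_key_value_heads))  # ~1.1 GB with 8 KV heads (4x smaller)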
@@ -313,17 +322,23 @@ def _calc_attn_embed_params(self) -> tuple:
        """
        Returns the embedding parameter count and attention parameter count for Llama-family (GQA) models.
        """
-        c = self.llm_config
+        llm_config = self.llm_config

        # Embedding parameters
        # assume tied embeddings unless tie_word_embeddings = False
-        embedding_count = 1 if getattr(c, "tie_word_embeddings", True) else 2
-        embedding_params = embedding_count * c.vocab_size * c.hidden_size
+        embedding_count = 1 if llm_config.tie_word_embeddings else 2
+        embedding_params = (
+            embedding_count * llm_config.vocab_size * llm_config.hidden_size
+        )

-        q_proj = c.hidden_size * c.hidden_size
-        k_proj = c.hidden_size * (c.num_key_value_heads * c.head_dim)
-        v_proj = c.hidden_size * (c.num_key_value_heads * c.head_dim)
-        o_proj = c.hidden_size * c.hidden_size
+        q_proj = llm_config.hidden_size * llm_config.hidden_size
+        k_proj = llm_config.hidden_size * (
+            llm_config.num_key_value_heads * llm_config.head_dim
+        )
+        v_proj = llm_config.hidden_size * (
+            llm_config.num_key_value_heads * llm_config.head_dim
+        )
+        o_proj = llm_config.hidden_size * llm_config.hidden_size
        attn_params = q_proj + k_proj + v_proj + o_proj

        return embedding_params, attn_params
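
A quick numeric check of these projection sizes with assumed values (hidden_size 4096, 8 KV heads, head_dim 128):

# With GQA, only the K and V projections shrink; Q and O stay hidden_size x hidden_size.
hidden_size, num_key_value_heads, head_dim = 4096, 8, 128
q_proj = o_proj = hidden_size * hidden_size  # ~16.8M parameters each
k_proj = v_proj = hidden_size * (num_key_value_heads * head_dim)  # ~4.2M each, 4x smaller here
print(q_proj + k_proj + v_proj + o_proj)  # 41943040 attention parameters per layer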
@@ -342,21 +357,24 @@ def model_memory(self) -> float:

        Returns the estimated memory size of the MoE Model (in GB).
        """
-        c = self.llm_config
+        llm_config = self.llm_config
        # Attention parameter count (Llama-style)
        embedding_params, attn_params = self._calc_attn_embed_params()

        # MoE MLP params per layer
        moe_params_per_layer = (
-            c.num_local_experts * 3 * c.hidden_size * c.intermediate_size
+            llm_config.num_local_experts
+            * 3
+            * llm_config.hidden_size
+            * llm_config.intermediate_size
        )
        total_params = (
-            c.num_hidden_layers * (attn_params + moe_params_per_layer)
+            llm_config.num_hidden_layers * (attn_params + moe_params_per_layer)
            + embedding_params
        )

        # Convert to GB
-        return total_params * c.bytes_per_parameter / 1e9
+        return total_params * llm_config.bytes_per_parameter / 1e9


def get_estimator(llm_config, **kwargs) -> MemoryEstimator:
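
Finally, the MoE accounting can be sanity-checked with Mixtral-8x7B-like numbers (assumed for illustration; router/gate weights are ignored, as in the estimator above):

# Standalone check of the MoE estimate (illustrative Mixtral-8x7B-like values).
hidden_size, intermediate_size = 4096, 14336
num_hidden_layers, num_local_experts = 32, 8
num_key_value_heads, head_dim = 8, 128
vocab_size, bytes_per_parameter = 32000, 2

embedding_params = 2 * vocab_size * hidden_size  # untied embeddings
attn_params = 2 * hidden_size**2 + 2 * hidden_size * (num_key_value_heads * head_dim)
moe_params_per_layer = num_local_experts * 3 * hidden_size * intermediate_size  # experts' gate/up/down
total_params = num_hidden_layers * (attn_params + moe_params_per_layer) + embedding_params
print(total_params / 1e9)  # ~46.7 (billions of parameters, in line with Mixtral's published size)
print(total_params * bytes_per_parameter / 1e9)  # ~93 GB of weights in fp16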