27 changes: 17 additions & 10 deletions ads/aqua/modeldeployment/deployment.py
@@ -1288,35 +1288,42 @@ def validate_deployment_params(

def recommend_shape(self, **kwargs) -> Union[Table, ShapeRecommendationReport]:
"""
For the CLI (set generate_table = True), generates the table (in rich diff) with valid
For the CLI (generate_table = True by default), generates the table (in rich diff) with valid
GPU deployment shapes for the provided model and configuration.

For the API (set generate_table = False), generates the JSON with valid
GPU deployment shapes for the provided model and configuration.

Validates if recommendations are generated, calls method to construct the rich diff
table with the recommendation data.
Validates the input and determines whether recommendations are available.

Parameters
----------
model_ocid : str
OCID of the model to recommend feasible compute shapes.
**kwargs
model_ocid : str
(Required) The OCID of the model to recommend feasible compute shapes for.
generate_table : bool, optional
If True, generate and return a rich-diff table; if False, return a JSON response (default is False).
compartment_id : str, optional
The OCID of the user's compartment to use for the recommendation.

Returns
-------
Table (generate_table = True)
A table format for the recommendation report with compatible deployment shapes
or troubleshooting info citing the largest shapes if no shape is suitable.
If `generate_table` is True, a table displaying the recommendation report with compatible deployment shapes,
or troubleshooting info if no shape is suitable.

ShapeRecommendationReport (generate_table = False)
A recommendation report with compatible deployment shapes, or troubleshooting info
citing the largest shapes if no shape is suitable.
If `generate_table` is False, a structured recommendation report with compatible deployment shapes,
or troubleshooting info citing the largest shapes if no shape is suitable.

Raises
------
AquaValueError
If model type is unsupported by tool (no recommendation report generated)
If the model type is unsupported and no recommendation report can be generated.
"""
deployment_config = self.get_deployment_config(model_id=kwargs.get("model_id"))
Review comment (Member): nit: add a validation here to check if kwargs has model id before passing to get_deployment_config; see the sketch after this hunk.

kwargs["deployment_config"] = deployment_config

try:
request = RequestRecommend(**kwargs)
except ValidationError as e:
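Following up on the review comment above, a minimal sketch of the suggested guard. It assumes `AquaValueError` (already referenced in the docstring's Raises section) is importable in this module and that the kwarg is named `model_id`, matching the call shown in the hunk; none of this is part of the merged diff.

# Hypothetical validation: fail fast when the model OCID is missing
# instead of passing None through to get_deployment_config.
model_id = kwargs.get("model_id")
if not model_id:
    raise AquaValueError(
        "A model OCID (`model_id`) is required to generate shape recommendations."
    )
deployment_config = self.get_deployment_config(model_id=model_id)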
17 changes: 17 additions & 0 deletions ads/aqua/shaperecommend/constants.py
@@ -38,6 +38,14 @@
"4bit": ["No smaller quantization available"],
}

RUNTIME_WEIGHTS = {
"use_bfloat16": "bfloat16",
"use_fp16": "float16",
"use_fp32": "float32",
"use_int8": "int8",
"use_int4": "int4",
"use_bfloat32": "bfloat32",
}

TEXT_GENERATION = "text_generation"
SAFETENSORS = "safetensors"
@@ -78,14 +86,23 @@

IN_FLIGHT_QUANTIZATION = {"4bit"} # vLLM only supports 4bit in-flight-quantization

VLLM_PARAMS_FAMILY = "VLLM_PARAMS"
VLLM_ENV = "VLLM"

QUANT_FLAG = "--quantization"
WEIGHT_DTYPE_FLAG = "--dtype"
MAX_MODEL_LEN_FLAG = "--max-model-len"

TROUBLESHOOT_MSG = "The selected model is too large to fit on standard GPU shapes with the current configuration.\nAs troubleshooting, we have suggested the two largest available GPU shapes using the smallest quantization level ('4bit') to maximize chances of fitting the model. "

VLLM_PARAMS = {
"max_model_len": "--max-model-len",
"in_flight_quant": "--quantization bitsandbytes --load-format bitsandbytes",
"trust_remote_code": "--trust-remote-code",
}

DEFAULT_WEIGHT_SIZE = "float32"
DEFAULT_MAX_SEQ_LEN = 4096

BITS_AND_BYTES_8BIT = "8bit"
BITS_AND_BYTES_4BIT = "4bit"
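For illustration only, one way constants like RUNTIME_WEIGHTS and DEFAULT_WEIGHT_SIZE could be consumed; the helper name and the config shape below are hypothetical and not taken from this PR.

# Hypothetical helper: resolve a weight dtype from boolean runtime flags
# in a model config dict, falling back to DEFAULT_WEIGHT_SIZE ("float32").
def resolve_weight_dtype(config: dict) -> str:
    for flag, dtype in RUNTIME_WEIGHTS.items():
        if config.get(flag):
            return dtype
    return DEFAULT_WEIGHT_SIZE

# resolve_weight_dtype({"use_bfloat16": True})  -> "bfloat16"
# resolve_weight_dtype({})                      -> "float32"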
96 changes: 57 additions & 39 deletions ads/aqua/shaperecommend/estimator.py
@@ -46,18 +46,18 @@ def kv_cache_memory(self) -> float:
Uses num_attention_heads (assumes no GQA, each attention head has its own query, key, value) for estimation.
"""
seq_len = self.seq_len or self.llm_config.max_seq_len
c = self.llm_config
llm_config = self.llm_config
kv_cache_dtype_bytes = QUANT_MAPPING.get(
c.weight_dtype, 2
llm_config.weight_dtype, 2
) # vLLM uses model's weight applied to KV cache

total_bytes = (
self.batch_size
* c.num_hidden_layers
* llm_config.num_hidden_layers
* 2
* c.num_attention_heads
* llm_config.num_attention_heads
* seq_len
* c.head_dim
* llm_config.head_dim
* kv_cache_dtype_bytes
)
return total_bytes / 1e9
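As a sanity check on the formula above, a standalone sketch with illustrative numbers (roughly a 7B-class model with full multi-head attention; none of these values come from the diff):

# Illustrative only: 32 layers, 32 attention heads, head_dim 128,
# a 4096-token sequence, and a float16 KV cache (2 bytes per element).
batch_size = 1
num_hidden_layers = 32
num_attention_heads = 32
head_dim = 128
seq_len = 4096
kv_cache_dtype_bytes = 2

total_bytes = (
    batch_size * num_hidden_layers * 2 * num_attention_heads
    * seq_len * head_dim * kv_cache_dtype_bytes
)
print(total_bytes / 1e9)  # ~2.15 GB of KV cache for one sequence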
@@ -69,15 +69,17 @@ def model_memory(self) -> float:

Model Parameter estimation: Standard decoder-only, untied/tied embeddings possible.
"""
c = self.llm_config
embedding_count = 1 if getattr(c, "tie_word_embeddings", True) else 2
llm_config = self.llm_config
Review comment from @VipulMascarenhas (Member), Sep 16, 2025: nit: we could just do self.llm_config below instead of declaring a new var, just personal preference. We can keep it as is for now.

embedding_count = 1 if llm_config.tie_word_embeddings else 2
embedding_params = (
embedding_count * c.vocab_size * c.hidden_size
embedding_count * llm_config.vocab_size * llm_config.hidden_size
) # input and output untied
layer_params = 12 * c.num_hidden_layers * (c.hidden_size**2) # GPT-style
layer_params = (
12 * llm_config.num_hidden_layers * (llm_config.hidden_size**2)
) # GPT-style
num_params = layer_params + embedding_params

return num_params * c.bytes_per_parameter / 1e9
return num_params * llm_config.bytes_per_parameter / 1e9
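A worked instance of the same estimate with GPT-2-XL-like dimensions (illustrative values assumed here, not read from any config in this PR):

# 12 * L * H^2 transformer parameters plus one tied embedding matrix.
num_hidden_layers = 48
hidden_size = 1600
vocab_size = 50257
bytes_per_parameter = 2  # float16 assumed for illustration

embedding_params = 1 * vocab_size * hidden_size           # tied embeddings
layer_params = 12 * num_hidden_layers * hidden_size ** 2
num_params = layer_params + embedding_params              # ~1.55e9 parameters
print(num_params * bytes_per_parameter / 1e9)             # ~3.1 GB of weights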

@property
def total_memory(self) -> float:
@@ -120,17 +122,24 @@ def construct_deployment_params(self) -> str:
-------
str: Parameter string for model deployment.
"""
c = self.llm_config
llm_config = self.llm_config
params = []
if self.seq_len < c.max_seq_len:
if self.seq_len < llm_config.max_seq_len:
params.append(VLLM_PARAMS["max_model_len"])
params.append(str(self.seq_len))

# Only suggest in-flight quantization for unquantized models when such quantization is requested
if not c.quantization and c.in_flight_quantization in IN_FLIGHT_QUANTIZATION:
if (
not llm_config.quantization
and llm_config.in_flight_quantization in IN_FLIGHT_QUANTIZATION
):
# vLLM only supports 4bit in-flight quantization
params.append(VLLM_PARAMS["in_flight_quant"])

# add trust-remote-code if custom modules are specified
if llm_config.trust_remote_code:
params.append(VLLM_PARAMS["trust_remote_code"])

params = " ".join(params) if params else ""
return params
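For a hypothetical model whose max_seq_len is 8192, deployed with seq_len = 4096, unquantized weights with in_flight_quantization = "4bit", and trust_remote_code enabled, the VLLM_PARAMS entries above would be joined into a string along the lines of:

--max-model-len 4096 --quantization bitsandbytes --load-format bitsandbytes --trust-remote-code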

@@ -154,12 +163,12 @@ def suggest_param_advice(self, allowed: float) -> str:
wt_gb = self.model_memory
batch_size = self.batch_size
seq_len = self.seq_len
weight_size = getattr(self.llm_config, "weight_dtype", "unknown")
weight_size = self.llm_config.weight_dtype
config = self.llm_config

suggested_quant_msg = None
quant_advice = ", ".join(config.suggested_quantizations)
quantization = getattr(config, "quantization", None)
quantization = config.quantization

advice = []

@@ -246,7 +255,7 @@ def limiting_factor(
)
else:
advice = (
f"No override PARAMS needed. \n\nModel fits well within the allowed compute shape "
f"Model fits well within the allowed compute shape "
f"({required:.1f}GB used / {allowed_gpu_memory:.1f}GB allowed)."
)
return advice
@@ -268,22 +277,22 @@ def model_memory(self) -> float:
Returns estimated model parameter memory (in GB), accurately accounting
for Llama-style attention and MLP, and tied or untied embeddings.
"""
c = self.llm_config
llm_config = self.llm_config

embedding_params, attn_params = self._calc_attn_embed_params()

# MLP params
gate_proj = c.hidden_size * c.intermediate_size
up_proj = c.hidden_size * c.intermediate_size
down_proj = c.intermediate_size * c.hidden_size
gate_proj = llm_config.hidden_size * llm_config.intermediate_size
up_proj = llm_config.hidden_size * llm_config.intermediate_size
down_proj = llm_config.intermediate_size * llm_config.hidden_size
mlp_params = gate_proj + up_proj + down_proj

# Total per-layer
layer_params = attn_params + mlp_params
# Total params
num_params = c.num_hidden_layers * layer_params + embedding_params
num_params = llm_config.num_hidden_layers * layer_params + embedding_params

return num_params * c.bytes_per_parameter / 1e9
return num_params * llm_config.bytes_per_parameter / 1e9
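Plugging Llama-2-7B-style dimensions into this estimate as an illustration (values assumed from the published config, not taken from this PR) reproduces the familiar ~6.7B parameter count:

# Attention + gate/up/down MLP + untied input/output embeddings.
hidden_size = 4096
intermediate_size = 11008
num_hidden_layers = 32
vocab_size = 32000
num_key_value_heads = 32   # Llama-2-7B uses full MHA (no KV-head grouping)
head_dim = 128
bytes_per_parameter = 2    # float16

embedding_params = 2 * vocab_size * hidden_size                 # untied
attn_params = (
    2 * hidden_size * hidden_size                               # q_proj, o_proj
    + 2 * hidden_size * num_key_value_heads * head_dim          # k_proj, v_proj
)
mlp_params = 3 * hidden_size * intermediate_size                # gate/up/down
num_params = num_hidden_layers * (attn_params + mlp_params) + embedding_params
print(num_params / 1e9)                          # ~6.74B parameters
print(num_params * bytes_per_parameter / 1e9)    # ~13.5 GB of float16 weights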

@property
def kv_cache_memory(self) -> float:
@@ -293,18 +302,18 @@ def kv_cache_memory(self) -> float:
Grouped Query Attention uses num_key_value_heads, where groups of Q heads share a K and V projection.
num_key_value_heads < num_attention_heads, which reduces the KV Cache size.
"""
c = self.llm_config
seq_len = self.seq_len or getattr(c, "max_seq_len", 2048)
kv_cache_dtype_bytes = QUANT_MAPPING.get(c.weight_dtype, 2)
kv_heads = c.num_key_value_heads
llm_config = self.llm_config
seq_len = self.seq_len or llm_config.max_seq_len
kv_cache_dtype_bytes = QUANT_MAPPING.get(llm_config.weight_dtype, 2)
kv_heads = llm_config.num_key_value_heads

total_bytes = (
self.batch_size
* c.num_hidden_layers
* llm_config.num_hidden_layers
* 2
* kv_heads
* seq_len
* c.head_dim
* llm_config.head_dim
* kv_cache_dtype_bytes
)
return total_bytes / 1e9
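A quick comparison against the MHA estimate earlier in this file, using illustrative Llama-3-8B-style values (8 KV heads instead of 32 query heads; assumed, not read from the diff):

batch_size = 1
num_hidden_layers = 32
num_key_value_heads = 8
head_dim = 128
seq_len = 4096
kv_cache_dtype_bytes = 2  # float16

total_bytes = (
    batch_size * num_hidden_layers * 2 * num_key_value_heads
    * seq_len * head_dim * kv_cache_dtype_bytes
)
print(total_bytes / 1e9)  # ~0.54 GB, 4x smaller than the 32-head MHA case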
@@ -313,17 +322,23 @@ def _calc_attn_embed_params(self) -> tuple:
"""
Returns the embedding parameter count and attention parameter count for Llama-family (GQA) models.
"""
c = self.llm_config
llm_config = self.llm_config

# Embedding parameters
# assume tied embeddings unless tie_word_embeddings = False
embedding_count = 1 if getattr(c, "tie_word_embeddings", True) else 2
embedding_params = embedding_count * c.vocab_size * c.hidden_size
embedding_count = 1 if llm_config.tie_word_embeddings else 2
embedding_params = (
embedding_count * llm_config.vocab_size * llm_config.hidden_size
)

q_proj = c.hidden_size * c.hidden_size
k_proj = c.hidden_size * (c.num_key_value_heads * c.head_dim)
v_proj = c.hidden_size * (c.num_key_value_heads * c.head_dim)
o_proj = c.hidden_size * c.hidden_size
q_proj = llm_config.hidden_size * llm_config.hidden_size
k_proj = llm_config.hidden_size * (
llm_config.num_key_value_heads * llm_config.head_dim
)
v_proj = llm_config.hidden_size * (
llm_config.num_key_value_heads * llm_config.head_dim
)
o_proj = llm_config.hidden_size * llm_config.hidden_size
attn_params = q_proj + k_proj + v_proj + o_proj

return embedding_params, attn_params
@@ -342,21 +357,24 @@ def model_memory(self) -> float:

Returns the estimated memory size of the MoE Model (in GB).
"""
c = self.llm_config
llm_config = self.llm_config
# Attention parameter count (Llama-style)
embedding_params, attn_params = self._calc_attn_embed_params()

# MoE MLP params per layer
moe_params_per_layer = (
c.num_local_experts * 3 * c.hidden_size * c.intermediate_size
llm_config.num_local_experts
* 3
* llm_config.hidden_size
* llm_config.intermediate_size
)
total_params = (
c.num_hidden_layers * (attn_params + moe_params_per_layer)
llm_config.num_hidden_layers * (attn_params + moe_params_per_layer)
+ embedding_params
)

# Convert to GB
return total_params * c.bytes_per_parameter / 1e9
return total_params * llm_config.bytes_per_parameter / 1e9
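As an illustration, Mixtral-8x7B-style dimensions (assumed from the published config, not from this PR) plugged into this estimate land close to the advertised ~46.7B total parameters; the router/gate weights are ignored by the formula, so it is a slight underestimate.

hidden_size = 4096
intermediate_size = 14336
num_hidden_layers = 32
num_local_experts = 8
vocab_size = 32000
num_key_value_heads = 8
head_dim = 128
bytes_per_parameter = 2  # float16

embedding_params = 2 * vocab_size * hidden_size                   # untied
attn_params = (
    2 * hidden_size * hidden_size
    + 2 * hidden_size * num_key_value_heads * head_dim
)
moe_params_per_layer = num_local_experts * 3 * hidden_size * intermediate_size
total_params = (
    num_hidden_layers * (attn_params + moe_params_per_layer) + embedding_params
)
print(total_params / 1e9)                         # ~46.7B parameters
print(total_params * bytes_per_parameter / 1e9)   # ~93 GB of float16 weights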


def get_estimator(llm_config, **kwargs) -> MemoryEstimator: